# SVM:- Support Vector Machine

In [77]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import string
import joblib
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold


In [47]:
bbc_data=pd.read_csv("C:\\Users\\ajlad\Downloads\\bbc_data.csv")

In [69]:
bbc_data['labels'].unique()

array(['entertainment', 'business', 'sport', 'politics', 'tech'],
      dtype=object)

In [48]:
bbc_data

Unnamed: 0,data,labels
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment
...,...,...
2220,Warning over Windows Word files Writing a Mic...,tech
2221,Fast lifts rise into record books Two high-sp...,tech
2222,Nintendo adds media playing to DS Nintendo is...,tech
2223,Fast moving phone viruses appear Security fir...,tech


In [49]:
bbc_data.isnull().sum()

data      0
labels    0
dtype: int64

In [50]:
bbc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   data    2225 non-null   object
 1   labels  2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [51]:
def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [52]:
x=bbc_data["data"].apply(preprocess)
y=bbc_data["labels"]

In [53]:
x

0       musicians tackle us red tape musicians groups ...
1       u2s desire number one u2 three prestigious gra...
2       rocker doherty onstage fight rock singer pete ...
3       snicket tops us box office chart film adaptati...
4       oceans twelve raids box office oceans twelve c...
                              ...                        
2221    fast lifts rise record books two highspeed lif...
2222    nintendo adds media playing ds nintendo releas...
2223    fast moving phone viruses appear security firm...
2224    hacker threat apples itunes users apples music...
Name: data, Length: 2225, dtype: object

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [55]:
vectorizer=TfidfVectorizer()
x = vectorizer.fit_transform(bbc_data['data'])

In [87]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=2)
from sklearn.svm import LinearSVC
sv_model=LinearSVC(penalty='l2', dual=False, random_state=42)

In [88]:
sv_model.fit(x_train,y_train)

In [89]:
sv_model.predict(x_test)

array(['politics', 'tech', 'sport', 'business', 'business', 'business',
       'politics', 'sport', 'sport', 'entertainment', 'entertainment',
       'tech', 'tech', 'politics', 'business', 'entertainment',
       'politics', 'tech', 'sport', 'business', 'sport', 'tech', 'tech',
       'politics', 'entertainment', 'politics', 'business', 'sport',
       'tech', 'tech', 'sport', 'politics', 'politics', 'tech', 'tech',
       'sport', 'sport', 'politics', 'business', 'sport', 'sport',
       'sport', 'politics', 'sport', 'sport', 'tech', 'politics', 'sport',
       'entertainment', 'politics', 'tech', 'business', 'politics',
       'tech', 'sport', 'politics', 'business', 'tech', 'tech', 'tech',
       'business', 'entertainment', 'politics', 'tech', 'business',
       'sport', 'tech', 'business', 'politics', 'entertainment', 'tech',
       'sport', 'entertainment', 'entertainment', 'tech', 'entertainment',
       'business', 'tech', 'entertainment', 'tech', 'entertainment',
       'spor

In [90]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,sv_model.predict(x_test))

0.9842696629213483

In [92]:
# Predict on training set
y_train_pred = sv_model.predict(x_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Predict on validation set
y_val_pred = sv_model.predict(x_test)
val_accuracy = accuracy_score(y_test, y_val_pred)

print(f'Training Accuracy: {train_accuracy}')
print(f'Validation Accuracy: {val_accuracy}')

Training Accuracy: 1.0
Validation Accuracy: 0.9842696629213483


In [93]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold cross-validation
from sklearn.model_selection import cross_val_score
# Perform k-fold cross-validation
cross_val_scores = cross_val_score(sv_model, x, y, cv=kf)
print("Cross-validation scores:", cross_val_scores)
print("Mean CV accuracy:", cross_val_scores.mean())

Cross-validation scores: [0.98202247 0.97752809 0.97078652 0.97752809 0.98651685]
Mean CV accuracy: 0.978876404494382


In [94]:
joblib.dump(sv_model,'sv_model.pkl')

['sv_model.pkl']