# SVM: Support Vector Machine

In [41]:
import re
import nltk
import joblib
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from gensim.models import Word2Vec

In [31]:
# Location of the BBC news CSV, kept in one named place so it is easy to spot.
bbc_csv_path = "C:\\Users\\ajlad\\Downloads\\bbc_data.csv"
bbc_data = pd.read_csv(bbc_csv_path)

In [35]:
# Show the first rows followed by the column dtypes, one below the other.
print(bbc_data.head(), bbc_data.dtypes, sep="\n")

                                                data         labels
0  Musicians to tackle US red tape  Musicians gro...  entertainment
1  U2s desire to be number one  U2, who have won ...  entertainment
2  Rocker Doherty in on-stage fight  Rock singer ...  entertainment
3  Snicket tops US box office chart  The film ada...  entertainment
4  Oceans Twelve raids box office  Oceans Twelve,...  entertainment
data      object
labels    object
dtype: object


In [36]:
# Keep handles on the text column and on the label column.
x, y = bbc_data["data"], bbc_data["labels"]

In [37]:
# Define the preprocessing function

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use by _english_stop_words().
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Lowercase ``text``, strip punctuation and drop English stop words.

    Parameters
    ----------
    text : object
        The raw document. Anything that is not a ``str`` is treated as
        "no text".

    Returns
    -------
    list of str
        The remaining whitespace-separated tokens in their original order,
        or an empty list when ``text`` is not a string.
    """
    # Ensure text is a string
    if not isinstance(text, str):
        return []

    # Lowercase the text and remove punctuation
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenize and remove stop words; the set is cached so applying this
    # function to a whole column does not reload the corpus per row.
    stop_words = _english_stop_words()
    return [word for word in cleaned.split() if word not in stop_words]

In [38]:
# Tokenise the article text. Rows that already hold a token list are passed
# through untouched so that re-running this cell is harmless: without the
# guard, a second run would hand lists to preprocess_text, which returns []
# for any non-string input and would silently blank the whole column.
bbc_data['data'] = bbc_data['data'].apply(
    lambda doc: doc if isinstance(doc, list) else preprocess_text(doc)
)

In [42]:
# Initialize and train the Word2Vec model
# A fixed seed keeps the initial weights the same from run to run. Per the
# gensim documentation, bit-for-bit repeatable training additionally needs
# workers=1 and a fixed PYTHONHASHSEED; workers=4 is kept here for speed.
word2vec_model = Word2Vec(
    sentences=bbc_data['data'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    seed=42,
)


In [70]:
# Persist the trained embedding model next to the notebook.
joblib.dump(value=word2vec_model, filename="w2v.pkl")

['w2v.pkl']

In [43]:
def word_vectorizer(doc, model, num_features):
    """Average the embeddings of the tokens in ``doc`` that ``model`` knows.

    Parameters
    ----------
    doc : iterable
        Tokens of one document.
    model : object
        Anything exposing ``model.wv`` that supports ``token in model.wv``
        and ``model.wv[token]``.
    num_features : int
        Length of the returned vector; the accumulator starts as a float32
        zero vector of this length.

    Returns
    -------
    numpy.ndarray
        The mean of the vectors found for ``doc``, or the all-zero vector
        when none of the tokens is present in ``model.wv``.
    """
    running_sum = np.zeros((num_features,), dtype="float32")
    matched = 0
    for token in doc:
        # Tokens missing from the vocabulary contribute nothing.
        if token not in model.wv:
            continue
        running_sum = running_sum + model.wv[token]
        matched += 1
    if matched > 0:
        return running_sum / matched
    return running_sum

# Apply word vectorization to each document. The vector width is read from
# the trained model instead of being repeated as a literal, so it cannot
# drift from the vector_size the model was built with.
bbc_data['vectors'] = bbc_data['data'].apply(
    lambda tokens: word_vectorizer(tokens, word2vec_model, word2vec_model.wv.vector_size)
)

# Example to check the vector for the first document
print(bbc_data['vectors'].head(1))


0    [0.12256169, 0.24330522, 0.03752677, 0.2188870...
Name: vectors, dtype: object


In [44]:
# Gather the per-document vectors into a single NumPy array; the labels stay
# a pandas Series.
X = np.asarray(bbc_data['vectors'].tolist())
y = bbc_data['labels']

In [45]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)


In [57]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


# Define parameter grid (keys are prefixed with the pipeline step name that
# make_pipeline derives from the estimator class: "linearsvc")
param_grid = {
    'linearsvc__C': [0.1, 1, 10, 100],
    'linearsvc__max_iter': [1000, 2000, 5000]
}

# Create a pipeline with scaling and SVC. random_state is fixed so that the
# solver's internal shuffling, and therefore the fitted model and the scores
# reported below, are repeatable from run to run.
pipeline = make_pipeline(StandardScaler(), LinearSVC(random_state=42))

# Set up GridSearchCV: 5-fold cross-validation scored on accuracy, using all
# available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

# Fit grid search
grid_search.fit(x_train, y_train)

# Get best parameters and model
print("Best Parameters: ", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate the model on the held-out test split
y_pred = best_model.predict(x_test)
print("Test Accuracy: ", accuracy_score(y_test, y_pred))


Best Parameters:  {'linearsvc__C': 10, 'linearsvc__max_iter': 1000}
Test Accuracy:  0.9236526946107785


In [69]:
# Persist the best scaler + LinearSVC pipeline found by the grid search.
joblib.dump(value=best_model, filename="svm-model.pkl")

['svm-model.pkl']

In [63]:
# Classify one hand-written paragraph as a smoke test of the whole chain
# (preprocess -> average word vectors -> trained pipeline).
sample_text = "The world of entertainment was abuzz this week as Hollywood premiered its latest blockbuster, a high-octane superhero film that has already broken several box office records. The movie, directed by a renowned filmmaker known for his previous hits, features an all-star cast that includes some of the biggest names in the industry. Critics are praising the film for its groundbreaking special effects and engaging storyline. Fans are flocking to theaters in droves, eager to experience the action-packed spectacle on the big screen. Additionally, the film’s soundtrack, featuring tracks from popular artists, has also topped the music charts, further solidifying its place in popular culture."
sample_tokens = preprocess_text(sample_text)

# predict() expects a 2-D array, hence the reshape to a single row.
sample_vector = word_vectorizer(
    sample_tokens, word2vec_model, word2vec_model.wv.vector_size
).reshape(1, -1)

# Stored under its own name: the original assigned the result to `y`, which
# replaced the label Series used by the split/fit cells and broke re-runs.
predicted_label = best_model.predict(sample_vector)[0]
predicted_label

'entertainment'

In [59]:
# List the distinct category labels in order of first appearance.
pd.unique(bbc_data["labels"])

array(['entertainment', 'business', 'sport', 'politics', 'tech'],
      dtype=object)

In [65]:
import streamlit as st

In [68]:
# Ask for a free-text string through Streamlit's text_input widget.
# NOTE(review): the output recorded under this cell shows Streamlit printing a
# `streamlit run ...` command rather than a widget, so this call presumably
# belongs in a standalone app script launched that way — confirm.
test = st.text_input("Enter a String")


2024-07-19 15:36:29.978 
  command:

    streamlit run d:\codes\env\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
