In [49]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [50]:
# Sample dataset
df=pd.read_csv('./IMDB Dataset.csv')
df=df.sample(10000)

In [51]:
df['sentiment']=df['sentiment'].replace({'positive':1,'negative':0})

In [52]:
X=df['review']
y=df['sentiment']

In [53]:
# Define the custom preprocessing functions
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def convert_lower(text):
    return text.lower()

def remove_special(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])

def stem_words(text):
    ps = PorterStemmer()
    return ' '.join([ps.stem(word) for word in text.split()])

In [54]:
# Create a pipeline of the preprocessing steps using FunctionTransformer
preprocessing = Pipeline(steps=[
    ('html_clean', FunctionTransformer(lambda x: [clean_html(text) for text in x], validate=False)),
    ('lowercase', FunctionTransformer(lambda x: [convert_lower(text) for text in x], validate=False)),
    ('remove_special_chars', FunctionTransformer(lambda x: [remove_special(text) for text in x], validate=False)),
    ('remove_stopwords', FunctionTransformer(lambda x: [remove_stopwords(text) for text in x], validate=False)),
    ('stemming', FunctionTransformer(lambda x: [stem_words(text) for text in x], validate=False))
])

In [55]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [56]:
# Combine the preprocessing with vectorization and SVM
svm_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('vectorizer', CountVectorizer()),  # Convert cleaned text to vectors
    ('svm', SVC())                      # SVM model
])

In [62]:
# Define the hyperparameter grid for tuning SVM
param_grid = {
    'svm__C': [0.1, 1, 10, 100],        # Regularization parameter
    'svm__kernel': ['linear', 'rbf'],   # Kernel type
    'svm__gamma': ['scale'],    # Kernel coefficient for 'rbf' kernel
    'svm__degree': [2, 3, 4]            # Degree of the polynomial kernel (only used if kernel='poly')
}

In [63]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=1, n_jobs=1)

In [None]:
# Train the model with the best hyperparameters
 # Example target labels (positive, negative)

# Fit the grid search
grid_search.fit(X, y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [None]:
# Output the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Predict using the best model
best_model = grid_search.best_estimator_
predictions = best_model.predict(X)
print("Predictions:", predictions)