In [27]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 

In [29]:
def sentiment_analysis(file_path):
    """Tune and compare four sentiment classifiers on a CSV of reviews.

    The CSV is read with ``pd.read_csv`` and must contain a ``review_text``
    column and a ``review_rating`` column of strings that include the numeric
    rating. The first run of digits in the rating is taken as the score;
    scores >= 4 are labelled positive (1) and everything else negative (0).

    Steps: print a short data overview, drop rows with missing values, build
    the labels, hold out 20% of the reviews as a test set, turn the text into
    bag-of-words counts, grid-search an SVM, a decision tree, a logistic
    regression and a random forest with 5-fold cross-validation, and print a
    report per model followed by an accuracy comparison table.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    None
        All results are printed.
    """
    # One seed for the split and for the randomised estimators, so a re-run
    # reproduces the same numbers.
    seed = 42

    # Load the dataset
    raw_df = pd.read_csv(file_path)

    def data_understanding(frame):
        """Print structure, summary statistics and missing-value counts."""
        print("Dataset Information:")
        # DataFrame.info() writes its report itself and returns None, so it
        # must not be wrapped in print() (that emits a stray "None" line).
        frame.info()

        print("\nDataset Description:")
        print(frame.describe())

        # Check for missing values
        print("\nMissing Values:")
        print(frame.isnull().sum())

    data_understanding(raw_df)

    def data_preprocessing(frame):
        """Return (review texts, binary sentiment labels) for complete rows."""
        # Drop rows with missing values into a new frame instead of mutating
        # the one that was just displayed.
        clean = frame.dropna()

        # expand=False yields a Series (not a one-column DataFrame), which is
        # what a single capture group is meant to produce here.
        ratings = clean['review_rating'].str.extract(r'(\d+)', expand=False).astype(int)

        # Positive (1) for ratings >= 4, negative (0) otherwise.
        sentiment = (ratings >= 4).astype(int)

        return clean['review_text'], sentiment

    texts, y = data_preprocessing(raw_df)

    # Split the raw text BEFORE vectorising. Fitting the vectorizer on the
    # full corpus would let the held-out reviews shape the feature space.
    text_train, text_test, y_train, y_test = train_test_split(
        texts, y, test_size=0.2, random_state=seed
    )

    # Learn the vocabulary from the training reviews only, then reuse it
    # unchanged for the test reviews.
    vectorizer = CountVectorizer(stop_words='english')
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    def evaluate_model(model, param_grid):
        """Grid-search ``model``, report on the test set, return its accuracy."""
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        y_pred = grid_search.best_estimator_.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Best Parameters: {grid_search.best_params_}")
        print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
        print(f"Accuracy: {accuracy}\n")
        return accuracy

    # (comparison label, printed header, estimator, hyper-parameter grid)
    model_specs = [
        (
            'SVM',
            "SVM Model:",
            svm.SVC(),
            {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
        ),
        (
            'Decision Tree',
            "Decision Tree Model:",
            DecisionTreeClassifier(random_state=seed),
            {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]},
        ),
        (
            'Logistic Regression',
            "Logistic Regression Model:",
            # Raw counts are unscaled, so the default 100 iterations may stop
            # before the solver converges; allow more.
            LogisticRegression(max_iter=1000),
            {'C': [0.1, 1, 10]},
        ),
        (
            'Random Forest',
            "Random Forest Classifier Model:",
            RandomForestClassifier(random_state=seed),
            {
                'n_estimators': [50, 100, 200],
                'criterion': ['gini', 'entropy'],
                'max_depth': [None, 10, 20],
                'max_leaf_nodes': [None, 10, 20],
            },
        ),
    ]

    # Train and evaluate each model, keeping the accuracy already computed
    # for the report so the test set is not predicted a second time.
    models = []
    accuracies = []
    for label, header, estimator, param_grid in model_specs:
        print(header)
        models.append(label)
        accuracies.append(evaluate_model(estimator, param_grid))

    # Compare the performance of each model
    comparison_df = pd.DataFrame({'Model': models, 'Accuracy': accuracies})
    print("\nModel Comparison:")
    print(comparison_df)

# Run the analysis on the reviews CSV at this path (adjust to your dataset location)
sentiment_analysis(file_path='archives/kaggle.csv')

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820 entries, 0 to 819
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     820 non-null    int64 
 1   review_title   820 non-null    object
 2   review_text    819 non-null    object
 3   review_rating  820 non-null    object
dtypes: int64(1), object(3)
memory usage: 25.8+ KB
None

Dataset Description:
       Unnamed: 0
count  820.000000
mean   409.500000
std    236.857904
min      0.000000
25%    204.750000
50%    409.500000
75%    614.250000
max    819.000000

Missing Values:
Unnamed: 0       0
review_title     0
review_text      1
review_rating    0
dtype: int64
SVM Model:
Best Parameters: {'C': 1, 'kernel': 'linear'}
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.52      0.62        50
           1       0.82      0.93      0.87       114

    accuracy               