In [106]:
import pandas as pd
import numpy as np
import regex as re
import string
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer, StandardScaler, LabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.pipeline import make_pipeline


# Data Loading:

In [107]:
# Read the movie metadata table and the train/test review tables from ./data.
movies_df, train_df, test_df = map(
    pd.read_csv,
    ['data/movies.csv', 'data/train.csv', 'data/test.csv'],
)

In [108]:
# Keep a single row per movie id, and a single row per
# (movie, reviewer, review text) combination in the training reviews.
movies_df = movies_df.drop_duplicates(subset='movieid')
train_df = train_df.drop_duplicates(subset=['movieid', 'reviewerName', 'reviewText'])

In [109]:
# Discard training rows that have no review text, then renumber the rows 0..n-1.
train_df = train_df.dropna(subset='reviewText').reset_index(drop=True)

In [110]:
# Renumber the movie rows 0..n-1 after the duplicate removal above.
movies_df = movies_df.reset_index(drop=True)

# Merge Metadata:

In [111]:
# Attach the movie metadata to every review. A left join keeps each review row
# even when its movieid has no match in movies_df.
data = pd.merge(left=train_df, right=movies_df, how='left', on='movieid')
test_df = pd.merge(left=test_df, right=movies_df, how='left', on='movieid')

In [112]:
# Separate the label column from the two feature columns used by the model.
tgt = data.loc[:, 'sentiment']
data = data.loc[:, ['reviewText', 'audienceScore']]

In [113]:
# Keep the same feature columns as the training frame. The ColumnTransformer
# used by the model selects both 'audienceScore' and 'reviewText' by name, so a
# frame holding only 'reviewText' could not be passed to the fitted pipeline.
test_df = test_df[['reviewText', 'audienceScore']]

In [114]:
# Encode the sentiment labels as integers; `lb` keeps the label <-> integer
# mapping. The encoder output is flattened to 1-D for use as the target vector.
lb = LabelBinarizer()
tgt = lb.fit_transform(tgt).ravel()


In [115]:
class TextProcessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalises raw review text.

    For every review the transformer:

    1. lower-cases the text;
    2. drops every whitespace-separated token containing an ASCII apostrophe;
    3. removes every character that is neither alphabetic nor whitespace;
    4. drops tokens of two characters or fewer and tokens in ``stop_words``;
    5. joins the surviving tokens with single spaces.
    """

    # scikit-learn's English stop words plus review-domain words. The
    # apostrophe-free contractions ('dont', 'isnt', ...) catch contractions
    # typed without an ASCII apostrophe; tokens that do contain one are
    # dropped earlier in process_text.
    stop_words = set(ENGLISH_STOP_WORDS) | {
        'film', 'films', 'movie', 'movies', 'director', 'plot', 'plotline',
        'story', 'stories', 'storyline', 'storylines',
        'actor', 'actors', 'actress', 'actresses', 'cast', 'role', 'roles',
        # 'charaters' was a typo that left the plural unfiltered; the
        # misspelling is kept so previously filtered tokens stay filtered.
        'character', 'characters', 'charaters',
        'scene', 'scenes', 'cinema', 'hollywood', 'script', 'screenplay',
        'genre', 'genres', 'sequel', 'prequel', 'remake', 'remakes', 'original',
        'version', 'versions', 'franchise', 'franchises',
        'cinematography', 'cinematographer', 'cinematographers',
        'cinematographic', 'cinematographics', 'cinematic', 'cinematics',
        'cinematograph', 'cinematographs', 'cinematographical',
        'cinematographically',
        'like', 'just', 'review', 'reviews',
        'dont', 'didnt', 'doesnt', 'cant', 'couldnt', 'wouldnt', 'shouldnt',
        'wont', 'isnt', 'arent', 'wasnt', 'werent', 'havent', 'hasnt', 'hadnt',
        'having', 'got', 'get', 'gets', 'gotten',
        'im', 'ive', 'id', 'ill', 'youre', 'youve', 'youll', 'youd',
        'theyre', 'theyve', 'theyll', 'theyd', 'weve', 'wed', 'were',
    }

    # Neither pattern is used by the methods of this class; both are retained
    # so that code referencing the attributes keeps working.
    # The hyphen is placed last in the character class so it is a literal '-'.
    # Written as ':-_' it formed a range that also matched digits' neighbours,
    # '@', and every uppercase letter.
    process_pattern = re.compile(r"\s|[.,?!;:_-]")
    whitespace_pattern = re.compile(r"\s+")

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X: pd.Series, y=None, stem=False):
        """Return a Series of cleaned review strings.

        Parameters
        ----------
        X : pd.Series
            Raw review texts. Other iterables (e.g. a list of strings) are
            wrapped in a Series first. Missing values become empty strings
            and non-string values are converted with ``str``.
        y : ignored
            Present for scikit-learn API compatibility.
        stem : bool, default False
            Accepted for backward compatibility; it has no effect.

        Returns
        -------
        pd.Series
            One cleaned string per input element. The input is not modified.
        """
        reviews = X if isinstance(X, pd.Series) else pd.Series(X)
        # fillna/astype both return new objects, so the caller's data is untouched.
        return reviews.fillna("").astype(str).apply(self.process_text)

    def process_text(self, text):
        """Clean a single review string and return the normalised text."""
        text = text.lower()

        # Tokens containing an ASCII apostrophe are dropped entirely, not repaired.
        text = " ".join(token for token in text.split() if "'" not in token)

        # Keep letters and whitespace only. Removed characters are not replaced
        # by a space, so "well-made" becomes "wellmade".
        text = "".join(ch for ch in text if ch.isalpha() or ch.isspace())

        # str.split() already discards empty tokens and collapses whitespace
        # runs, so no separate whitespace-normalisation pass is needed.
        return " ".join(
            token
            for token in text.split()
            if len(token) > 2 and token not in self.stop_words
        )


In [116]:
# Text branch: clean each review with TextProcessor, then turn the cleaned
# strings into TF-IDF features.
text_pipeline = Pipeline(
    steps=[
        ('text_processor', TextProcessor()),
        ('tfidf_vectorizer', TfidfVectorizer(stop_words='english')),
    ]
)

In [117]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [118]:
# Route each input column to its branch. 'audienceScore' is passed as a
# one-column list (2-D input for the numeric steps); 'reviewText' is passed as
# a bare name so the text steps receive a 1-D Series of strings.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipeline, ['audienceScore']),
        ('reviewText', text_pipeline, 'reviewText'),
    ]
)

# Feature Engineering:

In [119]:
# Hold out 20% of the rows for evaluation. The split is shuffled, stratified on
# the target so both parts keep the same class balance, and seeded for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    tgt,
    test_size=0.2,
    shuffle=True,
    stratify=tgt,
    random_state=42,
)

In [120]:
from sklearn.feature_selection import SelectPercentile, SelectKBest

# Two univariate feature selectors: the top 25% of features, or the top 1000.
# Only `k_percentile` is used by the logistic-regression pipeline below.
k_percentile = SelectPercentile(percentile=25)
skb = SelectKBest(k=1000)

In [121]:
from sklearn.feature_selection import mutual_info_classif, SelectFromModel

# End-to-end model: preprocessing -> percentile feature selection -> logistic
# regression. fit() returns the fitted pipeline itself, so it is bound directly.
logreg_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('k_percentile', k_percentile),
        ('logreg', LogisticRegression(max_iter=100000)),
    ]
).fit(X_train, y_train)

# Mean accuracy on the held-out split.
accuracy = accuracy_score(y_test, logreg_pipeline.predict(X_test))
print(f"Accuracy: {accuracy:.2f}")



Accuracy: 0.81


# Hyperparameter Tuning (Logistic Regression):

In [122]:
# Disabled grid-search configuration for `logreg_pipeline`; every line in this
# cell is a comment, so running it defines nothing.
# Entries commented once ('# ') are max_iter=1000, C=1, penalty='l2' and
# solver='liblinear', which match the single candidate reported in the retained
# grid-search output. Entries commented twice ('# #') are wider alternatives
# that were already switched off.
# param_grid = {
# #     'k_percentile__percentile': [10, 20, 30, 40, 50],
# #     'logreg__C': [0.1, 1, 10, 100, 1000],
# #     'logreg__penalty': ['l2'],
# #     'logreg__solver': ['liblinear', 'lbfgs', 'sag', 'saga', 'newton-cg'],
#     'logreg__max_iter': [1000],
# #     'k_percentile__percentile': [10],
#     'logreg__C': [ 1],
#     'logreg__penalty': ['l2'],
#     'logreg__solver': ['liblinear'],
# }

In [123]:
# Disabled 5-fold, accuracy-scored grid search over `param_grid` for
# `logreg_pipeline`. `param_grid` is itself commented out in the previous cell,
# so both cells must be re-enabled together before this can run.
# grid_search_lr = GridSearchCV(
#     estimator=logreg_pipeline,
#     param_grid=param_grid,
#     cv=5,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose = 3
# )

# # Fit the grid search to the training data
# grid_search_lr.fit(X_train, y_train)



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 4/5] END logreg__C=1, logreg__max_iter=1000, logreg__penalty=l2, logreg__solver=liblinear;, score=0.801 total time=   2.8s
[CV 1/5] END logreg__C=1, logreg__max_iter=1000, logreg__penalty=l2, logreg__solver=liblinear;, score=0.801 total time=   2.9s
[CV 2/5] END logreg__C=1, logreg__max_iter=1000, logreg__penalty=l2, logreg__solver=liblinear;, score=0.801 total time=   2.9s
[CV 3/5] END logreg__C=1, logreg__max_iter=1000, logreg__penalty=l2, logreg__solver=liblinear;, score=0.802 total time=   2.9s
[CV 5/5] END logreg__C=1, logreg__max_iter=1000, logreg__penalty=l2, logreg__solver=liblinear;, score=0.802 total time=   2.9s


In [125]:
import pickle

# Serialise the fitted pipeline to disk.
# NOTE(review): the destination is an absolute path on one specific machine;
# the load cell below reads the same path, so change both together.
with open(
    '/Users/arshi/Desktop/movie-recommendation-booking-app/movie-sentiment/model_config/logreg_pipeline.pkl',
    'wb',
) as model_file:
    pickle.dump(logreg_pipeline, model_file)



In [127]:
# Read the pickled pipeline back from disk to check that it round-trips.
# pickle.load can execute arbitrary code, so only use it on files you created
# yourself. The pipeline contains a TextProcessor step, so that class must be
# defined in the process that unpickles it.
with open(
    '/Users/arshi/Desktop/movie-recommendation-booking-app/movie-sentiment/model_config/logreg_pipeline.pkl',
    'rb',
) as model_file:
    loaded_pipeline = pickle.load(model_file)


In [129]:
# Hand-written reviews used to smoke-test the reloaded pipeline.
# They are stored under their own name so that `data`, the training DataFrame
# defined earlier in the notebook, is not overwritten by a dict; re-running the
# train/test split cell after this one therefore still works.
# pandas is already imported as `pd` in the first cell.
# NOTE(review): the audienceScore values here range from 2 to 10. Confirm they
# are on the same scale as the training column, since the pipeline scales this
# feature with statistics learned from the training data.
sample_reviews = {
    'reviewText': [
        'The movie was fantastic! Loved the acting and the plot.',
        'An average film with a predictable storyline.',
        'Absolutely horrible experience. The worst movie I have ever seen.',
        'A brilliant film with exceptional performances by the cast.',
        'It was okay, but not as good as I expected.'
    ],
    'audienceScore': [9, 5, 2, 10, 6]
}

# The column names must match the ones the pipeline's ColumnTransformer selects.
df = pd.DataFrame(sample_reviews)

# Show the frame as plain text.
print(df)



                                          reviewText  audienceScore
0  The movie was fantastic! Loved the acting and ...              9
1      An average film with a predictable storyline.              5
2  Absolutely horrible experience. The worst movi...              2
3  A brilliant film with exceptional performances...             10
4        It was okay, but not as good as I expected.              6


In [130]:
# Predict one label per sample review. The integers are the encoded sentiment
# classes produced by `lb`; inspect `lb.classes_` for the mapping.
loaded_pipeline.predict(X=df)

array([1, 0, 0, 1, 0])