In [1]:
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation
from sklearn.utils import shuffle
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [2]:
translator = str.maketrans('', '', punctuation)
stop_words = set(stopwords.words('english'))
stop_words.remove('not')

In [3]:
ignore_words = {'film', 'movie', 'one', 'even', 'would', 'time', 'get', 'story', 'could',
    'plot', 'make', 'see', 'also', 'way', 'little', 'well', 'people', 'never',
    'know', 'two', 'another', 'big', 'made', 'go', 'back', 'around', 'going',
    'think', 'still', 'characters', 'first', 'character', 'scene', 'scenes',
    'films', 'movies', 'man', 'new', 'may', 'take', 'almost', 'every', 'things',
    'real', 'comes', 'come', 'fact', 'last', 'point', 'plays', 'played', 'role',
    'years', 'john', 'audience', 'us'}

In [4]:
negative_documents = []
max_len_negative = 0
for file in os.listdir('../data/raw/neg'):
    with open ('../data/raw/neg/' + file) as f:
        text = f.read()
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [t.translate(translator) for t in tokens]
        tokens = [t for t in tokens if t]
        tokens = [t for t in tokens if t not in stop_words and t not in ignore_words]
        if len(tokens) > max_len_negative:
            max_len_negative = len(tokens)
        negative_documents.append(' '.join(tokens))

In [5]:
positive_documents = []
max_len_positive= 0
for file in os.listdir('../data/raw/pos'):
    with open ('../data/raw/pos/' + file) as f:
        text = f.read()
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [t.translate(translator) for t in tokens]
        tokens = [t for t in tokens if t]
        tokens = [t for t in tokens if t not in stop_words and t not in ignore_words]
        if len(tokens) > max_len_positive:
            max_len_positive = len(tokens)
        positive_documents.append(' '.join(tokens))

In [21]:
min(max_len_positive, max_len_negative)

879

In [6]:
from collections import Counter

neg_tokens = [t for doc in negative_documents for t in doc.split()]
neg_counter = Counter(neg_tokens)
most_common_neg = neg_counter.most_common(100)
print("neg:", most_common_neg)

pos_tokens = [t for doc in positive_documents for t in doc.split()]
pos_counter = Counter(pos_tokens)
most_common_pos = pos_counter.most_common(100)
print("pos:", most_common_pos)

neg: [('nt', 3442), ('not', 2694), ('like', 1836), ('good', 1128), ('bad', 1021), ('much', 998), ('really', 787), ('action', 606), ('director', 595), ('better', 530), ('end', 527), ('something', 525), ('seems', 502), ('work', 494), ('best', 493), ('nothing', 493), ('life', 492), ('many', 489), ('enough', 481), ('script', 477), ('thing', 449), ('love', 448), ('funny', 444), ('gets', 441), ('look', 436), ('actually', 436), ('makes', 431), ('though', 430), ('however', 422), ('comedy', 412), ('least', 411), ('say', 400), ('great', 396), ('minutes', 383), ('since', 381), ('long', 375), ('acting', 369), ('ca', 368), ('actors', 366), ('seen', 366), ('guy', 364), ('world', 361), ('find', 360), ('cast', 360), ('old', 349), ('original', 348), ('might', 343), ('show', 342), ('right', 341), ('ever', 338), ('goes', 337), ('anything', 337), ('performance', 336), ('course', 333), ('part', 332), ('interesting', 332), ('lot', 328), ('year', 326), ('effects', 324), ('trying', 315), ('give', 313), ('away

In [7]:
negative_documents = shuffle(negative_documents, random_state=42)
positive_documents = shuffle(positive_documents, random_state=42)

In [8]:
BASE_DIR = Path('../data/processed')
for split in ['train', 'val', 'test']:
    (BASE_DIR / split).mkdir(parents=True, exist_ok=True)

In [9]:
X_train = pd.DataFrame(negative_documents[:700] + positive_documents[:700])
y_train = pd.Series([0 for _ in range(700)] + [1 for _ in range(700)])
X_train, y_train = shuffle(X_train, y_train, random_state=42)
X_train.to_parquet('../data/processed/train/X.parquet')
y_train.to_frame().to_parquet('../data/processed/train/y.parquet')

In [10]:
X_val = pd.DataFrame(negative_documents[700:850] + positive_documents[700:850])
y_val = pd.Series([0 for _ in range(150)] + [1 for _ in range(150)])
X_val, y_val = shuffle(X_val, y_val, random_state=42)
X_val.to_parquet('../data/processed/val/X.parquet')
y_val.to_frame().to_parquet('../data/processed/val/y.parquet')

In [11]:
X_test = pd.DataFrame(negative_documents[850:] + positive_documents[850:])
y_test = pd.Series([0 for _ in range(150)] + [1 for _ in range(150)])
X_test, y_test = shuffle(X_test, y_test, random_state=42)
X_test.to_parquet('../data/processed/test/X.parquet')
y_test.to_frame().to_parquet('../data/processed/test/y.parquet')

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    min_df=2
)

X_train_vec = vectorizer.fit_transform(X_train[0])
X_val_vec = vectorizer.transform(X_val[0])

In [103]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_vec, y_train)
y_pred = logreg.predict(X_val_vec)
print(accuracy_score(y_val, y_pred))
print(f1_score(y_val, y_pred))

0.8366666666666667
0.839344262295082


In [104]:
from sklearn.svm import LinearSVC
svm = LinearSVC(random_state=42)
svm.fit(X_train_vec, y_train)
y_pred = svm.predict(X_val_vec)
print(accuracy_score(y_val, y_pred))
print(f1_score(y_val, y_pred))

0.8333333333333334
0.8366013071895425


In [105]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
y_pred = nb.predict(X_val_vec)
print(accuracy_score(y_val, y_pred))
print(f1_score(y_val, y_pred))

0.8033333333333333
0.8039867109634552


In [106]:
X_test_vec = vectorizer.transform(X_test[0])
best_model = svm 
test_preds = best_model.predict(X_test_vec)
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           0       0.86      0.83      0.85       150
           1       0.84      0.87      0.85       150

    accuracy                           0.85       300
   macro avg       0.85      0.85      0.85       300
weighted avg       0.85      0.85      0.85       300



In [107]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt.fit(X_train_vec, y_train)
y_pred = dt.predict(X_val_vec)
print(accuracy_score(y_val, y_pred))
print(f1_score(y_val, y_pred))

0.63
0.607773851590106


In [108]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
rf.fit(X_train_vec, y_train)
y_pred = rf.predict(X_val_vec)
print(accuracy_score(y_val, y_pred))
print(f1_score(y_val, y_pred))

0.8
0.7902097902097902


In [109]:
X_test_vec = vectorizer.transform(X_test[0])
test_preds = rf.predict(X_test_vec)
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           0       0.79      0.86      0.82       150
           1       0.85      0.77      0.81       150

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300



In [78]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LinearSVC())
])

param_grid = {
    "tfidf__max_features": [2000, 5000, 10000, 15000, 20000],
    "tfidf__ngram_range": [(1,1), (1,2), (1,3)],
    "tfidf__min_df": [2, 5, 10],
}


grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train[0], y_train)

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best params: {'tfidf__max_features': 20000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 1)}
Best CV score: 0.8493085219206964


In [112]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer (
    max_features=10000,
    ngram_range=(1,1),
    min_df=5
)

X_train_vec = vectorizer.fit_transform(X_train[0])
X_val_vec = vectorizer.transform(X_val[0])

In [113]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt.fit(X_train_vec, y_train)
y_pred = dt.predict(X_val_vec)
print(accuracy_score(y_val, y_pred))
print(f1_score(y_val, y_pred))

0.6233333333333333
0.6062717770034843


In [114]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
rf.fit(X_train_vec, y_train)
y_pred = rf.predict(X_val_vec)
print(accuracy_score(y_val, y_pred))
print(f1_score(y_val, y_pred))

0.85
0.8464163822525598


In [115]:
X_test_vec = vectorizer.transform(X_test[0])
test_preds = rf.predict(X_test_vec)
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           0       0.80      0.85      0.82       150
           1       0.84      0.79      0.82       150

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300

