In [1]:
import pandas as pd

df = pd.read_csv("data/imdb.csv")
df = df[~df.text.isna()]

### classification


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd

max_features = 10000

# Splitting the data into train and test sets
X = df['text']
y = df['target']

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Initialize TF-IDF vectorizer
count_vectorizer = CountVectorizer(
    min_df=1, max_features=max_features)
count_vectorizer = count_vectorizer.fit(X)

# Fit and transform the text data
X_train = count_vectorizer.transform(X_train_text)
X_test = count_vectorizer.transform(X_test_text)

In [3]:
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Initialize classifiers
rf_classifier = RandomForestClassifier(verbose=True)
svc_classifier = LinearSVC()
svc_classifier = CalibratedClassifierCV(svc_classifier, method='sigmoid')
gb_classifier = GradientBoostingClassifier()
classifiers = {
    'Random Forest': rf_classifier,
    'SVC': svc_classifier,
    'Gradient Boosting': gb_classifier
}


results = {'Classifier': [], 'Accuracy': [],
           'Precision': [], 'Recall': [], 'F1': []}

# Train and evaluate each classifier
for clf_name, clf in tqdm(classifiers.items()):
    print(clf_name, clf)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results['Classifier'].append(clf_name)
    results['Accuracy'].append(accuracy)
    results['Precision'].append(precision)
    results['Recall'].append(recall)
    results['F1'].append(f1)

# Create DataFrame to display results

results_df = pd.DataFrame(results)
results_df.to_csv(f'models/accuracies_{max_features}.csv')
results_df

  0%|          | 0/3 [00:00<?, ?it/s]

Random Forest RandomForestClassifier(verbose=True)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   55.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s


SVC CalibratedClassifierCV(estimator=LinearSVC())


 67%|██████▋   | 2/3 [02:12<00:57, 57.84s/it] 

Gradient Boosting GradientBoostingClassifier()


100%|██████████| 3/3 [02:46<00:00, 55.64s/it]


Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1
0,Random Forest,0.8495,0.855284,0.845302,0.850264
1,SVC,0.8603,0.855698,0.870425,0.862999
2,Gradient Boosting,0.8044,0.779038,0.855786,0.815611


In [4]:
import pickle
import joblib

model_params_filename = 'models/predictions/svc_imdb_' + \
    str(max_features) + '.sav'
pickle.dump(svc_classifier, open(model_params_filename, 'wb'))
joblib.dump(count_vectorizer, 'models/vectorizer/vectorizer_imdb_' +
            str(max_features) + '.pkl')

['models/vectorizer/vectorizer_imdb_10000.pkl']