# Modeling

## Vectorize the data

Now that we have a dataframe with 3 columns :  `tokenized` (title + synopsis), `genre` and `length`, we can vectorize the data. We will use the `Bag-of-Words` method.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import sys
sys.path.append('../preprocessing')

from preprocess import *

We start by importing the libraries and getting the preprocessed data.

In [30]:

df = pd.read_csv('../preprocessing/preprocessed_data.csv')
df_test = pd.read_csv('../preprocessing/preprocessed_data_test.csv')
df_2 = pd.read_csv('../data/allocine_genres_train.csv')
df_2_test = pd.read_csv('../data/allocine_genres_test.csv')

# keep only the columns we need
df_2 = df_2[['synopsis', 'genre','titre']]
df_2_test = df_2_test[['synopsis','genre', 'titre']]

df_2.head()

Unnamed: 0,synopsis,genre,titre
0,"En visite à Istanbul , le célèbre détective be...",policier,Le Crime de l' Orient - Express
1,Un jeune homme d' origine modeste est accusé d...,drame,12 hommes en colère
2,"Lorsque Marie-Laure , mère de quatre jeunes en...",drame,Après moi le bonheur
3,Un vagabond s’ éprend d’ une belle et jeune ve...,romance,Les Lumières de la ville
4,"L' histoire vraie de Carl Brashear , premier A...",biopic,Les Chemins de la dignité


In [46]:
# initialize the CountVectorizer
vectorizer = CountVectorizer()

# fit the vectorizer on the text
vectorizer.fit(df_2['synopsis'])

# get all the unique words
print(len(vectorizer.get_feature_names_out()))

# transform train and test data into vectors
X_train = vectorizer.transform(df_2['synopsis'])
X_test = vectorizer.transform(df_2_test['synopsis'])

# initialize the LabelEncoder
le = LabelEncoder()

# fit the encoder on the train labels
le.fit(df_2['genre'])

# transform the train and test labels
y_train = le.transform(df_2['genre'])

# split the train data into train and validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

# print the shapes
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# Path: model\model.ipynb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# initialize the model
model = LogisticRegression()

# fit the model on the train data
model.fit(X_train, y_train) 

# make predictions on the validation data
preds = model.predict(X_val)

# print the accuracy score
print("LogisticRegression: ", accuracy_score(y_val, preds))

# test the model on the test data
preds_test = model.predict(X_test)

# convert the predictions to text
preds_test = le.inverse_transform(preds_test)

# print the accuracy score
# print(df_2_test['genre'])
# print(preds_test)
print("LogisticRegression Test: ", accuracy_score(df_2_test['genre'], preds_test))
print("--------------------")

# test other models
from sklearn.ensemble import RandomForestClassifier

# initialize the model
model = RandomForestClassifier()

# fit the model on the train data
model.fit(X_train, y_train)

# make predictions on the validation data
preds = model.predict(X_val)

# print the accuracy score
print("RandomForestClassifier: ", accuracy_score(y_val, preds))

# test the model on the test data
preds_test = model.predict(X_test)

# convert the predictions to text
preds_test = le.inverse_transform(preds_test)

# print the accuracy score
print("RandomForestClassifier Test: ", accuracy_score(df_2_test['genre'], preds_test))
print("--------------------")


# test other models
from sklearn.svm import SVC

# initialize the model
model = SVC()

# fit the model on the train data
model.fit(X_train, y_train)

# make predictions on the validation data
preds = model.predict(X_val)

# print the accuracy score
print("SVC: ", accuracy_score(y_val, preds))

# test the model on the test data
preds_test = model.predict(X_test)

# convert the predictions to text
preds_test = le.inverse_transform(preds_test)

# print the accuracy score
print("SVC Test: ", accuracy_score(df_2_test['genre'], preds_test))
print("--------------------")

# test other models
from sklearn.naive_bayes import MultinomialNB

# initialize the model
model = MultinomialNB()

# fit the model on the train data
model.fit(X_train, y_train)

# make predictions on the validation data
preds = model.predict(X_val)

# print the accuracy score
print("MultinomialNB: ", accuracy_score(y_val, preds))

# test the model on the test data
preds_test = model.predict(X_test)

# convert the predictions to text
preds_test = le.inverse_transform(preds_test)

# print the accuracy score
print("MultinomialNB Test: ", accuracy_score(df_2_test['genre'], preds_test))
print("--------------------")

# test other models
from sklearn.neighbors import KNeighborsClassifier

# initialize the model
model = KNeighborsClassifier()

# fit the model on the train data
model.fit(X_train, y_train)

# make predictions on the validation data
preds = model.predict(X_val)

# print the accuracy score
print("KNeighborsClassifier: ", accuracy_score(y_val, preds))

# test the model on the test data
preds_test = model.predict(X_test)

# convert the predictions to text
preds_test = le.inverse_transform(preds_test)

# print the accuracy score
print("KNeighborsClassifier Test: ", accuracy_score(df_2_test['genre'], preds_test))
print("--------------------")

# test other models
from sklearn.tree import DecisionTreeClassifier

# initialize the model
model = DecisionTreeClassifier()

# fit the model on the train data
model.fit(X_train, y_train)

# make predictions on the validation data
preds = model.predict(X_val)

# print the accuracy score
print("DecisionTreeClassifier: ", accuracy_score(y_val, preds))

# test the model on the test data
preds_test = model.predict(X_test)

# convert the predictions to text
preds_test = le.inverse_transform(preds_test)

# print the accuracy score
print("DecisionTreeClassifier Test: ", accuracy_score(df_2_test['genre'], preds_test))
print("--------------------")

# test other models
# use xgboost
from xgboost import XGBClassifier

# initialize the model
model = XGBClassifier()

# fit the model on the train data
model.fit(X_train, y_train)

# make predictions on the validation data
preds = model.predict(X_val)

# print the accuracy score
print("XGBClassifier: ", accuracy_score(y_val, preds))

# test the model on the test data
preds_test = model.predict(X_test)

# convert the predictions to text
preds_test = le.inverse_transform(preds_test)

# print the accuracy score
print("XGBClassifier Test: ", accuracy_score(df_2_test['genre'], preds_test))
print("--------------------")






22608
(2300, 22608) (575, 22608) (2300,) (575,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression:  0.43130434782608695
LogisticRegression Test:  0.4909596662030598
--------------------
RandomForestClassifier:  0.36695652173913046
RandomForestClassifier Test:  0.4102920723226704
--------------------
SVC:  0.3130434782608696
SVC Test:  0.35465924895688455
--------------------
MultinomialNB:  0.4295652173913043
MultinomialNB Test:  0.42141863699582754
--------------------
KNeighborsClassifier:  0.16521739130434782
KNeighborsClassifier Test:  0.1835883171070932
--------------------
DecisionTreeClassifier:  0.27304347826086955
DecisionTreeClassifier Test:  0.30737134909596664
--------------------
XGBClassifier:  0.40869565217391307
XGBClassifier Test:  0.4617524339360223
--------------------


We then create a `CountVectorizer` object and fit it to the `tokenized` column of the dataframe. This will create a vocabulary of all the unique words in the `tokenized` column. We then use the `transform()` method to convert the text data into a matrix of token counts. This matrix is a sparse matrix, which means that it contains a lot of zeros. We can convert this matrix to a dense matrix using the `toarray()` method.

In [4]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the tokenized column
bow = vectorizer.fit_transform(df['tokenized'])

# Convert the sparse matrix to a DataFrame
bow_df = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())

# Concatenate the BoW DataFrame with the original DataFrame
df_bow = pd.concat([df, bow_df], axis=1)

# Initialize a LabelEncoder object
label_encoder = LabelEncoder()

# Fit and transform the genre column
df_bow['genre_encoded'] = label_encoder.fit_transform(df['genre'])

df_bow.head()




Unnamed: 0,tokenized,genre,length,10e,1er,25em,2e,3e,3em,4h44,...,évoluent,évoqu,évoquent,ête,être,île,œil,œuf,œuvr,genre_encoded
0,"['visit', 'célebr', 'détect', 'belg', 'embarqu...",policier,64,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
1,"['homm', 'coler', 'jeun', 'homm', 'd', 'origin...",drame,58,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,3
2,"['apres', 'bonheur', 'lorsqu', 'mer', 'quatr',...",drame,71,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
3,"['vagabond', 'éprend', 'bel', 'jeun', 'vendeux...",romance,24,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
4,"['histoir', 'vrai', 'premi', 'afro', 'américai...",biopic,35,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We then split the data into training and testing sets. We will use 80% of the data for training and 20% for testing. We will also set the `random_state` parameter to 42 so that we can reproduce the results.

In [5]:
X_train, X_val, y_train, y_val = train_test_split(df_bow.drop(['genre', 'tokenized', 'genre_encoded'], axis=1), 
                                                  df_bow['genre_encoded'], test_size=0.2, random_state=42)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(2300, 9727) (575, 9727) (2300,) (575,)


We'll first start with a basic naive model. We will just see what class is the most frequent and predict that class for all the test data. We will then use the `accuracy_score` function to calculate the accuracy of the model.

In [6]:
from sklearn.dummy import DummyClassifier

# Create dummy classifier that always predicts the most frequent class
dummy_clf = DummyClassifier(strategy='most_frequent')

# Fit the dummy classifier on the training data
dummy_clf.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_dummy = dummy_clf.predict(X_val)

# Evaluate the performance of the dummy classifier
from sklearn.metrics import accuracy_score
dummy_accuracy = accuracy_score(y_val, y_pred_dummy)
print("Accuracy of dummy classifier:", dummy_accuracy)


Accuracy of dummy classifier: 0.1565217391304348


## Testing multiple models

In [7]:
# Naive Bayes   
from sklearn.naive_bayes import MultinomialNB

# Create a MultinomialNB object
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(X_train, y_train)

# Compute the accuracy of the classifier
nb_accuracy = nb_classifier.score(X_val, y_val)
print("Accuracy of Naive Bayes classifier:", nb_accuracy)



Accuracy of Naive Bayes classifier: 0.4747826086956522


In [12]:
#  SVM
from sklearn.svm import SVC

# Create a SVC object
svc_classifier = SVC()

# Fit the classifier to the training data
svc_classifier.fit(X_train, y_train)

# Compute the accuracy of the classifier
svc_accuracy = svc_classifier.score(X_val, y_val)
print("Accuracy of SVM classifier:", svc_accuracy)


Accuracy of SVM classifier: 0.18956521739130436


In [13]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Create a LogisticRegression object
logreg = LogisticRegression(max_iter=1000)

# Fit the classifier to the training data
logreg.fit(X_train, y_train)

# Compute the accuracy of the classifier
logreg_accuracy = logreg.score(X_val, y_val)
print("Accuracy of Logistic Regression classifier:", logreg_accuracy)


Accuracy of Logistic Regression classifier: 0.49043478260869566


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Create a RandomForestClassifier object
rf_classifier = RandomForestClassifier()

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Compute the accuracy of the classifier
rf_accuracy = rf_classifier.score(X_val, y_val)
print("Accuracy of Random Forest classifier:", rf_accuracy)

Accuracy of Random Forest classifier: 0.42782608695652175


In [17]:
# XGBoost

from xgboost import XGBClassifier

# Create a XGBClassifier object
xgb_classifier = XGBClassifier()

# Fit the classifier to the training data
xgb_classifier.fit(X_train, y_train)

# Compute the accuracy of the classifier
xgb_accuracy = xgb_classifier.score(X_val, y_val)
print("Accuracy of XGBoost classifier:", xgb_accuracy)


Accuracy of XGBoost classifier: 0.4539130434782609


In [19]:
dummy_grid = {
    'strategy': ['most_frequent', 'stratified', 'prior', 'uniform', 'constant']
}
dummy = DummyClassifier( random_state = 42)
from sklearn.model_selection import GridSearchCV
dummy_gs = GridSearchCV(dummy, param_grid = dummy_grid, cv = 5)
dummy_gs.fit(X_train, y_train)
print(dummy_gs.best_score_)

0.17869565217391306


5 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\elmah\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\elmah\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\dummy.py", line 196, in fit
    raise ValueError(
ValueError: Constant target value has to be specified when the constant strategy is used.



In [25]:
# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the tokenized column
tfidf = tfidf_vectorizer.fit_transform(df['tokenized'])

# Convert the tfidf matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the tfidf DataFrame with the original DataFrame
df_tfidf = pd.concat([df, tfidf_df], axis=1)

# Initialize a LabelEncoder object
label_encoder = LabelEncoder()

# Fit and transform the genre column
df_tfidf['genre_encoded'] = label_encoder.fit_transform(df['genre'])

df_tfidf.head()

X_train, X_val, y_train, y_val = train_test_split(df_tfidf.drop(['genre', 'tokenized', 'genre_encoded'], axis=1),
                                                    df_tfidf['genre_encoded'], test_size=0.2, random_state=42)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# Create a MultinomialNB object
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(X_train, y_train)

# Compute the accuracy of the classifier
nb_accuracy = nb_classifier.score(X_val, y_val)
print("Accuracy of Naive Bayes classifier:", nb_accuracy)

# Create a SVC object
svc_classifier = SVC()

# Fit the classifier to the training data
svc_classifier.fit(X_train, y_train)

# Compute the accuracy of the classifier
svc_accuracy = svc_classifier.score(X_val, y_val)

print("Accuracy of SVM classifier:", svc_accuracy)

# Create a LogisticRegression object
logreg = LogisticRegression(max_iter=1000)

# Fit the classifier to the training data
logreg.fit(X_train, y_train)

# Compute the accuracy of the classifier
logreg_accuracy = logreg.score(X_val, y_val)

print("Accuracy of Logistic Regression classifier:", logreg_accuracy)

# Create a RandomForestClassifier object

rf_classifier = RandomForestClassifier()

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Compute the accuracy of the classifier
rf_accuracy = rf_classifier.score(X_val, y_val)

print("Accuracy of Random Forest classifier:", rf_accuracy)



(2300, 9727) (575, 9727) (2300,) (575,)
Accuracy of Naive Bayes classifier: 0.18608695652173912
Accuracy of SVM classifier: 0.1826086956521739
Accuracy of Logistic Regression classifier: 0.4765217391304348
Accuracy of Random Forest classifier: 0.4469565217391304
