In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
import warnings
 
warnings.filterwarnings(action = 'ignore')
 

In [37]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

stop_words = set(stopwords.words('english') + ['reuter', '\x03'])
lemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer()

def preprocessor(text: str):
    text = text.lower()

    table = str.maketrans('', '', string.punctuation)
    text = text.translate(table)

    text = re.sub(r'\d+', 'num', text)

    text = [word for word in text.split() if word not in stop_words]

    text = [lemmatizer.lemmatize(word) for word in text]
    
    # text = [stemmer.stem(word) for word in text]

    return " ".join(text)


In [38]:
df = pd.read_csv("Training-dataset.csv")

comedy_df = df.loc[df["comedy"] == 1]
cult_df = df.loc[df["cult"] == 1]
flashback_df = df.loc[df["flashback"] == 1]
historical_df = df.loc[df["historical"] == 1]
murder_df = df.loc[df["murder"] == 1]
revenge_df = df.loc[df["revenge"] == 1]
romantic_df = df.loc[df["romantic"] == 1]
scifi_df = df.loc[df["scifi"] == 1]
violence_df = df.loc[df["violence"] == 1]

sep_label_df = [comedy_df, cult_df, flashback_df,
    historical_df,
    murder_df,
    revenge_df,
    romantic_df,
    scifi_df,
    violence_df
]
col_val = 3
for i in sep_label_df:
    print(f"Number of '{i.columns[col_val]}' plots: {i.shape[0]}")
    col_val += 1

df['text'] = df['title'] + ' ' + df['plot_synopsis']
training_data = df[['text', 'comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']]
training_data.head() 

Number of 'comedy' plots: 1262
Number of 'cult' plots: 1801
Number of 'flashback' plots: 1994
Number of 'historical' plots: 186
Number of 'murder' plots: 4019
Number of 'revenge' plots: 1657
Number of 'romantic' plots: 2006
Number of 'scifi' plots: 204
Number of 'violence' plots: 3064


Unnamed: 0,text,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,Si wang ta After a recent amount of challenges...,0,0,0,0,1,1,0,0,1
1,Shattered Vengeance In the crime-ridden city o...,0,0,0,0,1,1,1,0,1
2,L'esorciccio Lankester Merrin is a veteran Cat...,0,1,0,0,0,0,0,0,0
3,"Serendipity Through Seasons ""Serendipity Throu...",0,0,0,0,0,0,1,0,0
4,The Liability Young and naive 19-year-old slac...,0,0,1,0,0,0,0,0,0


In [39]:
def training_rows(data, perc=0.8):
    return data.head(int(len(data)*(perc)))
def testing_rows(data, train):    
    return data.iloc[len(train):]

In [40]:
train_id_set = []
test_id_set = []
for i in sep_label_df:
    i_train = training_rows(i)
    i_test = testing_rows(i, i_train)
    train_id_set.extend(i_train.index.unique())
    test_id_set.extend(i_test.index.unique())
    
train_id_set = set(train_id_set)
test_id_set = set(test_id_set)

In [41]:
training_data['preprocessed_text'] = training_data['text'].apply(preprocessor)
training_data.head()

Unnamed: 0,text,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence,preprocessed_text
0,Si wang ta After a recent amount of challenges...,0,0,0,0,1,1,0,0,1,si wang ta recent amount challenge billy lo br...
1,Shattered Vengeance In the crime-ridden city o...,0,0,0,0,1,1,1,0,1,shattered vengeance crimeridden city tremont r...
2,L'esorciccio Lankester Merrin is a veteran Cat...,0,1,0,0,0,0,0,0,0,lesorciccio lankester merrin veteran catholic ...
3,"Serendipity Through Seasons ""Serendipity Throu...",0,0,0,0,0,0,1,0,0,serendipity season serendipity season heartwar...
4,The Liability Young and naive 19-year-old slac...,0,0,1,0,0,0,0,0,0,liability young naive numyearold slacker adam ...


In [42]:

X_train = training_data.loc[train_id_set, "preprocessed_text"]
X_test = training_data.loc[test_id_set, "preprocessed_text"]

y_train = df.loc[train_id_set, ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']]
y_test = df.loc[test_id_set, ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']]


In [43]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', OneVsRestClassifier(LinearSVC()))
])

# Define the parameter grid to search
param_grid = {
    'classifier__estimator__C': [0.1, 0.5, 0.8, 1],  
    'classifier__estimator__dual': ["auto"],
    'tfidf__max_df': [0.3, 0.5, 0.8, 1]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='f1_samples', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters: ", grid_search.best_params_)
print("Best Cross-validated Accuracy: {:.2f}".format(grid_search.best_score_))

best_pipeline = grid_search.best_estimator_
test_accuracy = best_pipeline.score(X_test, y_test)
print("Test Set Accuracy: {:.2f}".format(test_accuracy))


Best Parameters:  {'classifier__estimator__C': 1, 'classifier__estimator__dual': 'auto', 'tfidf__max_df': 0.8}
Best Cross-validated Accuracy: 0.43
Test Set Accuracy: 0.23


In [44]:
tfidf_vectorizer = TfidfVectorizer(max_df=grid_search.best_params_['tfidf__max_df'])
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

svm_classifier = OneVsRestClassifier(LinearSVC(C=grid_search.best_params_['classifier__estimator__C']))
svm_classifier.fit(X_train_tfidf, y_train)


In [45]:
y_pred = svm_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.5f}')

classification_rep = classification_report(y_test, y_pred, target_names=['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence'])
print('Classification Report:')
print(classification_rep)

Accuracy: 0.23234
Classification Report:
              precision    recall  f1-score   support

      comedy       0.59      0.16      0.25       257
        cult       0.58      0.29      0.38       364
   flashback       0.57      0.27      0.36       414
  historical       0.92      0.29      0.44        38
      murder       0.75      0.69      0.72       829
     revenge       0.57      0.24      0.34       336
    romantic       0.73      0.47      0.57       423
       scifi       0.56      0.11      0.19        44
    violence       0.70      0.61      0.65       623

   micro avg       0.69      0.45      0.55      3328
   macro avg       0.66      0.35      0.43      3328
weighted avg       0.67      0.45      0.52      3328
 samples avg       0.55      0.47      0.47      3328



In [46]:
validation_file = pd.read_csv("Task-2-validation-dataset.csv")
validation_file.head()


Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,cf32cb00-172d-40f2-a3c1-936e8a0d89d7,Shattered Hearts,"In the enchanting city of Verona, Italy, renow...",0,0,0,0,1,0,1,0,0
1,df7e125e-2d59-40e4-a126-9397e3a0ef21,Point Blank,Walker works with his friend Mal Reese to stea...,0,1,1,0,1,1,0,0,1
2,49bc73f3-9179-41cd-9774-905c7a3ac91b,Le charme discret de la bourgeoisie,The film consists of several thematically link...,1,0,1,0,0,0,0,0,0
3,0ed4822b-87af-44bc-a677-7f7abfdaccf3,A Gentleman's Dignity,A Gentleman's Dignity is about the careers and...,0,0,0,0,0,0,1,0,0
4,0b1b0fa4-43bc-41ba-9598-b3401894b96d,Carmen: A Hip Hopera,"Carmen Brown (Beyoncé) is a seductive, aspirin...",0,0,0,0,1,0,0,0,0


In [47]:
validation_file['text'] = validation_file['title'] + ' ' + validation_file['plot_synopsis']
val_data = validation_file[['ID','text', 'comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']]
val_data['preprocessed_text'] = val_data['text'].apply(preprocessor)

X_validation_train = training_data["preprocessed_text"]
X_validation_test = val_data['preprocessed_text']

y_validation_train = training_data[['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']]
y_validation_test = val_data[['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']]


tfidf_vectorizer = TfidfVectorizer(max_df=grid_search.best_params_['tfidf__max_df'])
X_validation_train_tfidf = tfidf_vectorizer.fit_transform(X_validation_train)
X_validation_test_tfidf = tfidf_vectorizer.transform(X_validation_test)

svm_classifier = OneVsRestClassifier(LinearSVC(C=grid_search.best_params_['classifier__estimator__C']))
svm_classifier.fit(X_validation_train_tfidf, y_validation_train)

In [48]:
y_validation_pred = svm_classifier.predict(X_validation_test_tfidf)
accuracy = accuracy_score(y_validation_test, y_validation_pred)
print(f'Accuracy: {accuracy:.5f}')
classification_rep = classification_report(y_validation_test, y_validation_pred, target_names=['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence'])
print('Classification Report:')
print(classification_rep)

Accuracy: 0.20118
Classification Report:
              precision    recall  f1-score   support

      comedy       0.55      0.18      0.27       175
        cult       0.53      0.30      0.38       247
   flashback       0.52      0.26      0.34       294
  historical       0.33      0.04      0.07        24
      murder       0.70      0.68      0.69       581
     revenge       0.52      0.21      0.30       237
    romantic       0.68      0.49      0.57       290
       scifi       1.00      0.06      0.12        31
    violence       0.64      0.56      0.60       420

   micro avg       0.64      0.44      0.52      2299
   macro avg       0.61      0.31      0.37      2299
weighted avg       0.62      0.44      0.49      2299
 samples avg       0.52      0.46      0.46      2299



In [49]:
df_nd_array = pd.DataFrame(y_validation_pred,  columns=['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence'])
result_df = pd.concat([val_data['ID'], df_nd_array], axis=1)
result_df.head()

Unnamed: 0,ID,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,cf32cb00-172d-40f2-a3c1-936e8a0d89d7,0,0,0,0,1,0,1,0,0
1,df7e125e-2d59-40e4-a126-9397e3a0ef21,0,0,0,0,1,0,0,0,0
2,49bc73f3-9179-41cd-9774-905c7a3ac91b,0,0,0,0,1,0,0,0,0
3,0ed4822b-87af-44bc-a677-7f7abfdaccf3,0,0,0,0,0,0,1,0,0
4,0b1b0fa4-43bc-41ba-9598-b3401894b96d,0,0,0,0,1,0,1,0,1


In [50]:
result_df.to_csv('prediction_file_Task2_a.csv', header = False, index = False)