### Importing required packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Reading the dataset and giving it column names

In [2]:
# Column names for the labelled ":::"-separated files, in field order
# (used when reading both the training data and the test solution file).
column_names=['serial','name','genre','desc']

In [3]:
# The file has no header row, so the column names are supplied explicitly.
# A multi-character separator is interpreted by pandas as a regular expression,
# which is why the Python parsing engine is requested.
# NOTE(review): only ":::" itself is matched, so fields appear to keep the spaces
# that surround the delimiter (the genre values shown by value_counts() carry a
# leading space). Stripping them here would also require stripping the test
# solution labels before they are encoded — confirm before changing.
# NOTE(review): the path is relative to the notebook's working directory.
df_train= pd.read_table('Downloads/archive (3)/Genre Classification Dataset/train_data.txt', sep=":::", header= None, names= column_names, engine='python' )

In [4]:
df_train.head()

Unnamed: 0,serial,name,genre,desc
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


### Checking number of examples in each class

In [5]:
df_train.genre.value_counts()

genre
 drama           13613
 documentary     13096
 comedy           7447
 short            5073
 horror           2204
 thriller         1591
 action           1315
 western          1032
 reality-tv        884
 family            784
 adventure         775
 music             731
 romance           672
 sci-fi            647
 adult             590
 crime             505
 animation         498
 sport             432
 talk-show         391
 fantasy           323
 mystery           319
 musical           277
 biography         265
 history           243
 game-show         194
 news              181
 war               132
Name: count, dtype: int64

In [6]:
# Genre strings of the training rows; these are the classification targets
# that get label-encoded below.
labels= df_train.genre

### Encoding the categorical labels into numbers

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# Fit the encoder on the training genres and store the resulting integer codes
# next to the original strings. The fitted encoder is kept in `label_encoder`
# because the same mapping is reused later for the test solution labels.
label_encoder = LabelEncoder()
df_train['encoded_labels']= label_encoder.fit_transform(labels)

In [9]:
df_train.head()

Unnamed: 0,serial,name,genre,desc,encoded_labels
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,8
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...,24
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,1
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,8
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,8


### Preprocessing the description by removing stop words and applying lemmatization

In [10]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [11]:
def preprocessing(text):
    """Return the lemmas of `text` with stop words and punctuation removed.

    The text is parsed with the notebook-level spaCy pipeline `nlp`. The lemma
    of every token that is neither a stop word nor punctuation is kept, in
    order, and the kept lemmas are joined with single spaces.
    """
    doc = nlp(text)
    kept_lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_lemmas)

In [12]:
# Run every description through preprocessing() and keep the result in a new column.
# NOTE: the column name 'prepocessed_desc' (sic) is referenced by later cells,
# so the spelling is intentionally left unchanged here.
df_train['prepocessed_desc']= df_train.desc.apply(preprocessing)

In [13]:
df_train.shape

(54214, 6)

In [14]:
df_train.head()

Unnamed: 0,serial,name,genre,desc,encoded_labels,prepocessed_desc
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,8,listen conversation doctor parent 10 year ol...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...,24,brother sister past incestuous relationship ...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,1,bus empty student field trip Museum Natural ...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,8,help unemployed father end meet Edith twin s...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,8,film title refer un recover body ground zero...


In [49]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

### Splitting the dataset into train and test data

In [16]:
# Hold out 20% of the rows for evaluation. Stratifying on the encoded labels
# keeps the genre proportions the same in both parts, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['prepocessed_desc'],
    df_train['encoded_labels'],
    test_size=0.2,
    random_state=42,
    stratify=df_train['encoded_labels'],
)

### Checking for class imbalance

In [17]:
y_train.value_counts()

encoded_labels
8     10890
7     10477
5      5957
21     4058
13     1763
24     1273
0      1052
26      826
18      707
9       627
2       620
14      585
19      538
20      518
1       472
6       404
3       398
22      346
23      313
10      258
16      255
15      222
4       212
12      194
11      155
17      145
25      106
Name: count, dtype: int64

In [18]:
y_test.value_counts()

encoded_labels
8     2723
7     2619
5     1490
21    1015
13     441
24     318
0      263
26     206
18     177
9      157
2      155
14     146
19     134
20     129
1      118
6      101
3      100
22      86
23      78
10      65
16      64
15      55
4       53
12      49
11      39
17      36
25      26
Name: count, dtype: int64

### Calculating class weights to manage the class imbalance in the dataset

In [19]:
from sklearn.utils.class_weight import compute_class_weight
# Class labels present in the training split, in sorted order; compute_class_weight
# returns one weight per entry of this array, in the same order.
train_classes = np.unique(y_train)
# 'balanced' gives rarer classes proportionally larger weights.
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train)
# Key each weight by its actual class label instead of by its array position, so
# the mapping stays correct even if the labels are not exactly 0..n_classes-1.
class_weight_dict = dict(zip(train_classes.tolist(), class_weights))

### Using a Pipeline to apply TF-IDF followed by Multinomial Naive Bayes

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features followed by Multinomial Naive Bayes.
# MultinomialNB's `class_prior` is documented as prior *probabilities*, so the
# balanced class weights are rescaled to sum to 1 before being passed in.
# Rescaling shifts every class's log-prior by the same constant, so the
# predicted labels are the same as with the unnormalised weights.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB(class_prior=class_weights / class_weights.sum()))
])

In [21]:
pipeline.fit(X_train,y_train)

In [22]:
y_pred= pipeline.predict(X_test)

In [23]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

### Printing classification report for Naive Bayes model

In [24]:
print(classification_report(y_test,y_pred,zero_division=0))

              precision    recall  f1-score   support

           0       0.44      0.02      0.03       263
           1       0.44      0.10      0.17       118
           2       0.73      0.05      0.10       155
           3       0.50      0.02      0.04       100
           4       0.00      0.00      0.00        53
           5       0.57      0.29      0.38      1490
           6       0.00      0.00      0.00       101
           7       0.54      0.89      0.67      2619
           8       0.44      0.82      0.57      2723
           9       1.00      0.01      0.01       157
          10       0.33      0.02      0.03        65
          11       0.53      0.23      0.32        39
          12       0.00      0.00      0.00        49
          13       0.72      0.18      0.28       441
          14       0.62      0.11      0.19       146
          15       0.67      0.04      0.07        55
          16       0.00      0.00      0.00        64
          17       0.00    

In [25]:
accuracy_score(y_test,y_pred)

0.49045467121645303

### Creating a pipeline to apply TF-IDF and Logistic Regression

In [26]:
# Same TF-IDF feature extraction as the Naive Bayes pipeline, followed by
# logistic regression. class_weight='balanced' asks scikit-learn to reweight
# classes inversely to their frequency in the training labels.
# NOTE(review): max_iter=1000 is presumably there so the solver converges on
# this many features/classes — confirm.
pipeline2 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logistic', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

In [27]:
pipeline2.fit(X_train,y_train)

In [28]:
y_pred2= pipeline2.predict(X_test)

### Printing classification report for Logistic Regression model

In [29]:
print(classification_report(y_test,y_pred2,zero_division=0))

              precision    recall  f1-score   support

           0       0.32      0.48      0.38       263
           1       0.40      0.74      0.52       118
           2       0.22      0.34      0.26       155
           3       0.22      0.30      0.25       100
           4       0.05      0.08      0.06        53
           5       0.61      0.47      0.53      1490
           6       0.15      0.35      0.21       101
           7       0.78      0.59      0.67      2619
           8       0.70      0.41      0.51      2723
           9       0.17      0.37      0.23       157
          10       0.14      0.26      0.18        65
          11       0.65      0.67      0.66        39
          12       0.10      0.24      0.14        49
          13       0.59      0.67      0.63       441
          14       0.37      0.82      0.51       146
          15       0.10      0.18      0.13        55
          16       0.12      0.17      0.14        64
          17       0.18    

In [30]:
accuracy_score(y_test,y_pred2)

0.48907129023333024

### Reading the test dataset provided

In [50]:
# Read the test file with the same ":::" separator as the training file.
# Only three column names are supplied here: there is no genre column.
# NOTE(review): the path is relative to the notebook's working directory.
df_test= pd.read_table('Downloads/archive (3)/Genre Classification Dataset/test_data.txt', sep=':::', header= None, names=['serial','name','desc'], engine='python')

In [32]:
df_test.head()

Unnamed: 0,serial,name,desc
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


In [33]:
# Apply the same preprocessing() used on the training descriptions, so the
# fitted pipelines see text prepared in the same way.
df_test['prepocessed_desc']= df_test.desc.apply(preprocessing)

In [None]:
df_test.head()

In [35]:
X_test_new= df_test.prepocessed_desc

### Applying Naive Bayes model to the test data

In [39]:
y_pred_new= pipeline.predict(X_test_new)

### Applying Logistic Regression Model to the test data

In [38]:
y_pred2_new= pipeline2.predict(X_test_new)

### Importing the test data solutions

In [40]:
# Load the test solutions; the file shares the four-column ":::"-separated
# layout of the training file, so the same column names are reused.
df_sol = pd.read_table(
    'Downloads/archive (3)/Genre Classification Dataset/test_data_solution.txt',
    sep=":::",
    header=None,
    names=column_names,
    engine='python',
)

In [42]:
df_sol.genre

0            thriller 
1              comedy 
2         documentary 
3               drama 
4               drama 
             ...      
54195          horror 
54196         western 
54197           adult 
54198           drama 
54199           drama 
Name: genre, Length: 54200, dtype: object

### Applying label encoding to the test data labels

In [45]:
# Reuse the encoder fitted on the training genres (transform, not fit_transform)
# so the integer codes here mean the same genres as the models' predictions.
df_sol['encoded_labels']= label_encoder.transform(df_sol.genre)

### Printing the accuracy scores & classification report for the respective models

In [52]:
accuracy_score(df_sol.encoded_labels, y_pred_new) #Naive Bayes accuracy

0.4918819188191882

In [51]:
accuracy_score(df_sol.encoded_labels, y_pred2_new) #Logistic Regression Accuracy

0.48677121771217713

In [57]:
print(classification_report(df_sol.encoded_labels,y_pred_new,zero_division=0)) #Naive Bayes classification report

              precision    recall  f1-score   support

           0       0.62      0.05      0.09      1314
           1       0.53      0.11      0.18       590
           2       0.83      0.07      0.12       775
           3       0.67      0.02      0.03       498
           4       0.00      0.00      0.00       264
           5       0.58      0.29      0.39      7446
           6       0.17      0.00      0.00       505
           7       0.54      0.88      0.67     13096
           8       0.43      0.82      0.57     13612
           9       0.62      0.01      0.01       783
          10       0.44      0.02      0.04       322
          11       0.47      0.30      0.36       193
          12       0.00      0.00      0.00       243
          13       0.68      0.20      0.30      2204
          14       0.63      0.12      0.21       731
          15       0.15      0.02      0.03       276
          16       0.50      0.01      0.02       318
          17       0.09    

In [54]:
print(classification_report(df_sol.encoded_labels,y_pred2_new,zero_division=0)) # Logistic Regression classification report

              precision    recall  f1-score   support

           0       0.31      0.49      0.38      1314
           1       0.36      0.66      0.46       590
           2       0.24      0.35      0.28       775
           3       0.21      0.28      0.24       498
           4       0.05      0.09      0.07       264
           5       0.59      0.46      0.52      7446
           6       0.15      0.34      0.21       505
           7       0.78      0.59      0.67     13096
           8       0.70      0.41      0.52     13612
           9       0.16      0.33      0.21       783
          10       0.15      0.28      0.19       322
          11       0.62      0.74      0.68       193
          12       0.10      0.24      0.14       243
          13       0.56      0.67      0.61      2204
          14       0.36      0.74      0.48       731
          15       0.16      0.30      0.21       276
          16       0.12      0.18      0.14       318
          17       0.22    