In [1]:
import nltk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix

In [2]:
train_data=pd.read_csv("C:\\Users\\barsh\\Desktop\\Genre Classification Dataset\\train_data.txt",sep=":::",names=['Title','Genre','Description'],engine='python')

In [3]:
train_data.head(5)

Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [4]:
train_data['Genre'].unique()

array([' drama ', ' thriller ', ' adult ', ' documentary ', ' comedy ',
       ' crime ', ' reality-tv ', ' horror ', ' sport ', ' animation ',
       ' action ', ' fantasy ', ' short ', ' sci-fi ', ' music ',
       ' adventure ', ' talk-show ', ' western ', ' family ', ' mystery ',
       ' history ', ' news ', ' biography ', ' romance ', ' game-show ',
       ' musical ', ' war '], dtype=object)

In [5]:
train_data['Genre'].value_counts()

Genre
drama           13613
documentary     13096
comedy           7447
short            5073
horror           2204
thriller         1591
action           1315
western          1032
reality-tv        884
family            784
adventure         775
music             731
romance           672
sci-fi            647
adult             590
crime             505
animation         498
sport             432
talk-show         391
fantasy           323
mystery           319
musical           277
biography         265
history           243
game-show         194
news              181
war               132
Name: count, dtype: int64

In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54214 entries, 1 to 54214
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        54214 non-null  object
 1   Genre        54214 non-null  object
 2   Description  54214 non-null  object
dtypes: object(3)
memory usage: 1.7+ MB


In [7]:
def remove_punct(text):
    import string as st
    punct=st.punctuation
    new_text=''.join([char for char in text if char not in punct])
    return new_text

train_data['Description']=train_data['Description'].apply(lambda x:remove_punct(x))

In [8]:
train_data['Description']=train_data['Description'].apply(lambda x:x.lower())
train_data.head()

Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,listening in to a conversation between his do...
2,Cupid (1997),thriller,a brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,as the bus empties the students for their fie...
4,The Secret Sin (1915),drama,to help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,the films title refers not only to the unreco...


In [9]:
def toknize(text):
    from nltk.tokenize import word_tokenize
    new_text=word_tokenize(text)
    return new_text
train_data['Description']=train_data['Description'].apply(lambda x:toknize(x))

In [10]:
def stemming(text):
    from nltk import PorterStemmer
    ps=PorterStemmer().stem

    new_text=" ".join([ps(word) for word in text ])
    return new_text
train_data['Description']=train_data['Description'].apply(lambda x : stemming(x))

In [11]:
tf_idf = TfidfVectorizer()
Description_tf_idf = tf_idf.fit_transform(train_data['Description'])

Description_tf_idf = pd.DataFrame.sparse.from_spmatrix(
    Description_tf_idf, 
    columns=tf_idf.get_feature_names_out()
)

Description_tf_idf


Unnamed: 0,00,000,000km,002,003,007,0074,0077,007paul,00o,...,ťthrillerť,ťtitanicť,ťto,ťŕ,żestán,żi,żo,żpleasur,żte,żwhere
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54209,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
train_data['Description len']=train_data['Description'].apply(lambda x:len(x)-x.count(' '))
train_data.head(10)

Unnamed: 0,Title,Genre,Description,Description len
1,Oscar et la dame rose (2009),drama,listen in to a convers between hi doctor and p...,390
2,Cupid (1997),thriller,a brother and sister with a past incestu relat...,142
3,"Young, Wild and Wonderful (1980)",adult,as the bu empti the student for their field tr...,457
4,The Secret Sin (1915),drama,to help their unemploy father make end meet ed...,794
5,The Unrecovered (2007),drama,the film titl refer not onli to the unrecov bo...,451
6,Quality Control (2011),documentary,qualiti control consist of a seri of 16mm sing...,1149
7,"""Pink Slip"" (2009)",comedy,in tough econom time max and joey have all but...,240
8,One Step Away (1985),crime,ron petri keanu reev is a troubl teen whose li...,225
9,"""Desperate Hours"" (2016)",reality-tv,a sudden calamit event caus great loss of life...,328
10,Spirits (2014/I),horror,four high school student embark on a terrifi j...,322


In [13]:
scaler=MinMaxScaler()
train_data['Description_len_sclr'] = scaler.fit_transform(train_data['Description len'].values.reshape(-1, 1))
train_data.head(5)

Unnamed: 0,Title,Genre,Description,Description len,Description_len_sclr
1,Oscar et la dame rose (2009),drama,listen in to a convers between hi doctor and p...,390,0.048047
2,Cupid (1997),thriller,a brother and sister with a past incestu relat...,142,0.014763
3,"Young, Wild and Wonderful (1980)",adult,as the bu empti the student for their field tr...,457,0.057039
4,The Secret Sin (1915),drama,to help their unemploy father make end meet ed...,794,0.102268
5,The Unrecovered (2007),drama,the film titl refer not onli to the unrecov bo...,451,0.056234


In [14]:
target=train_data['Genre']
feature=train_data.drop(['Genre','Title','Description','Description len'],axis=1)

In [15]:
target

1               drama 
2            thriller 
3               adult 
4               drama 
5               drama 
             ...      
54210          comedy 
54211          horror 
54212     documentary 
54213          comedy 
54214         history 
Name: Genre, Length: 54214, dtype: object

In [16]:
feature

Unnamed: 0,Description_len_sclr
1,0.048047
2,0.014763
3,0.057039
4,0.102268
5,0.056234
...,...
54210,0.043350
54211,0.067776
54212,0.019997
54213,0.058113


In [17]:
target.shape , feature.shape

((54214,), (54214, 1))

In [18]:
Description_tf_idf.head()

Unnamed: 0,00,000,000km,002,003,007,0074,0077,007paul,00o,...,ťthrillerť,ťtitanicť,ťto,ťŕ,żestán,żi,żo,żpleasur,żte,żwhere
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
feature.reset_index(drop=True, inplace=True)
Description_tf_idf.reset_index(drop=True, inplace=True)
feature = pd.concat([feature, pd.DataFrame(Description_tf_idf)], axis=1)

In [20]:
feature.shape

(54214, 114971)

In [21]:
feature.head()

Unnamed: 0,Description_len_sclr,00,000,000km,002,003,007,0074,0077,007paul,...,ťthrillerť,ťtitanicť,ťto,ťŕ,żestán,żi,żo,żpleasur,żte,żwhere
0,0.048047,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.014763,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.057039,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.102268,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.056234,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
x_train, x_test, y_train, y_test = train_test_split(feature,target,test_size=0.6)
print('x_train:',x_train.shape,'x_test:',x_test.shape,'y_train:',y_train.shape,'y_test:',y_test.shape)

x_train: (21685, 114971) x_test: (32529, 114971) y_train: (21685,) y_test: (32529,)


In [23]:
model = RandomForestClassifier(n_estimators=1000, max_depth=100, random_state=42)
model.fit(x_train, y_train)



In [24]:
model.score(x_train,y_train)



0.9991238183075859

In [91]:
predicted=model.predict(x_test[:5])



In [92]:
print(predicted)
print(y_test[:5])

[' documentary ' ' documentary ' ' documentary ' ' drama ' ' documentary ']
41385     documentary 
28279     documentary 
49334     documentary 
12447           drama 
36026           drama 
Name: Genre, dtype: object


In [94]:
a=accuracy_score(predicted,y_test[0:5])
print("the accuracy of the model on the testing set is : ",a*100,"%")

the accuracy of the model on the testing set is :  80.0 %
