In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the training data
train_data = pd.read_csv('/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt', sep=':::', names=['ID', 'Title', 'Genre', 'Description'], engine='python')
test_data = pd.read_csv('/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt', sep=':::', names=['ID', 'Title', 'Genre', 'Description'], engine='python')
test_data_solution = pd.read_csv('/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data_solution.txt', sep=':::', names=['ID', 'Genre', 'Description'], engine='python')


In [3]:
train_data.head()
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           54214 non-null  int64 
 1   Title        54214 non-null  object
 2   Genre        54214 non-null  object
 3   Description  54214 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


In [4]:
# Preprocess the data
train_data['Genre'] = train_data['Genre'].str.strip()
test_data_solution['Genre'] = test_data_solution['Genre'].str.strip()

In [5]:
train_data.head()

Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [6]:

# Split the data into training and testing sets
X_train = train_data['Description']
y_train = train_data['Genre']
X_test = test_data_solution['Description']
y_test = test_data_solution['Genre']

In [7]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [8]:
# Text Classification using Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
y_pred_nb = nb_classifier.predict(X_test_tfidf)

In [9]:
# Evaluate Naive Bayes model
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Accuracy of Naive Bayes Model:", accuracy_nb)
classification_report_nb = classification_report(y_test, y_pred_nb)
print("Classification Report for Naive Bayes Model:\n", classification_report_nb)

Accuracy of Naive Bayes Model: 0.48380073800738005


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for Naive Bayes Model:
               precision    recall  f1-score   support

      action       0.49      0.04      0.08      1314
       adult       1.00      0.01      0.01       590
   adventure       0.25      0.00      0.00       775
   animation       0.33      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.50      0.31      0.38      7446
       crime       0.50      0.00      0.00       505
 documentary       0.53      0.86      0.66     13096
       drama       0.42      0.82      0.56     13612
      family       0.50      0.00      0.01       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.93      0.27      0.42       193
     history       0.00      0.00      0.00       243
      horror       0.68      0.18      0.28      2204
       music       0.66      0.13      0.22       731
     musical       0.00      0.00      0.00       276
     mystery       1.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Text Classification using Logistic Regression
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train_tfidf, y_train)
y_pred_lr = lr_classifier.predict(X_test_tfidf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Evaluate Logistic Regression model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Accuracy of Logistic Regression Model:", accuracy_lr)
classification_report_lr = classification_report(y_test, y_pred_lr)
print("Classification Report for Logistic Regression Model:\n", classification_report_lr)


Accuracy of Logistic Regression Model: 0.5430258302583025


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for Logistic Regression Model:
               precision    recall  f1-score   support

      action       0.38      0.22      0.27      1314
       adult       0.53      0.18      0.27       590
   adventure       0.36      0.10      0.16       775
   animation       0.35      0.05      0.09       498
   biography       0.00      0.00      0.00       264
      comedy       0.48      0.50      0.49      7446
       crime       0.28      0.04      0.07       505
 documentary       0.64      0.82      0.72     13096
       drama       0.52      0.74      0.61     13612
      family       0.37      0.09      0.14       783
     fantasy       0.50      0.05      0.10       322
   game-show       0.70      0.51      0.59       193
     history       0.25      0.00      0.01       243
      horror       0.54      0.47      0.50      2204
       music       0.60      0.43      0.50       731
     musical       0.26      0.04      0.07       276
     mystery       0.29    

  _warn_prf(average, modifier, msg_start, len(result))
