<a href="https://colab.research.google.com/github/Aman-pr/Indo-Machine-learning-/blob/main/Movie_gener.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

In [65]:
train_data = pd.read_csv(
    '/content/train_data.txt',
    sep=':::',  # Split by ':::'
    header=None,
    names=['id', 'title', 'genre', 'description'],
    engine='python'
)

print("Training Data:")
print(train_data.head())

y_train = train_data['genre']
y_test = test_data['genre']

print("\nMissing genres in y_train:", y_train.isnull().sum())

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

original_genre_names = label_encoder.classes_
print("\nOriginal genre names:", original_genre_names)

Training Data:
   id                               title       genre  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                         description  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  

Missing genres in y_train: 0

Original genre names: [' action ' ' adult ' ' adventure ' ' animation ' ' biography ' ' comedy '
 ' crime ' ' documentary ' ' drama ' ' family ' ' fantasy ' ' game-show '
 ' history ' ' horror ' ' music ' ' musical ' ' mystery ' ' news '
 ' reality-tv ' ' romance ' ' sci-fi ' ' sh

In [66]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [61]:
print("\nClass Distribution Before SMOTE:", pd.Series(y_train).value_counts())
if len(np.unique(y_train)) > 1:
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)
    print("Class Distribution After SMOTE:", pd.Series(y_train_resampled).value_counts())
else:
    print("Skipping SMOTE: Only one class found.")
    X_train_resampled, y_train_resampled = X_train_tfidf, y_train



Class Distribution Before SMOTE: 8     13613
7     13096
5      7447
21     5073
13     2204
24     1591
0      1315
26     1032
18      884
9       784
2       775
14      731
19      672
20      647
1       590
6       505
3       498
22      432
23      391
10      323
16      319
15      277
4       265
12      243
11      194
17      181
25      132
Name: count, dtype: int64
Class Distribution After SMOTE: 8     13613
14    13613
15    13613
11    13613
19    13613
4     13613
17    13613
12    13613
16    13613
9     13613
26    13613
23    13613
2     13613
20    13613
24    13613
21    13613
10    13613
0     13613
3     13613
22    13613
13    13613
18    13613
6     13613
5     13613
7     13613
1     13613
25    13613
Name: count, dtype: int64


In [62]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_resampled, y_train_resampled)
y_pred_dt = dt_model.predict(X_test_tfidf)
print("\nDecision Tree F1-Score:", f1_score(y_test, y_pred_dt, average='weighted'))
print(classification_report(y_test, y_pred_dt, target_names=original_genre_names))


Decision Tree F1-Score: 0.32867925467630305
               precision    recall  f1-score   support

      action        0.12      0.12      0.12      1314
       adult        0.22      0.31      0.25       590
   adventure        0.16      0.15      0.16       775
   animation        0.11      0.12      0.11       498
   biography        0.02      0.03      0.02       264
      comedy        0.29      0.28      0.29      7446
       crime        0.08      0.10      0.09       505
 documentary        0.52      0.47      0.50     13096
       drama        0.40      0.36      0.38     13612
      family        0.07      0.07      0.07       783
     fantasy        0.05      0.05      0.05       322
   game-show        0.49      0.49      0.49       193
     history        0.03      0.05      0.04       243
      horror        0.27      0.28      0.28      2204
       music        0.30      0.39      0.34       731
     musical        0.09      0.17      0.11       276
     mystery       

In [63]:
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_resampled, y_train_resampled)

y_pred_lr = lr_model.predict(X_test_tfidf)
print("\nLogistic Regression F1-Score:", f1_score(y_test, y_pred_lr, average='weighted'))
print(classification_report(y_test, y_pred_lr, target_names=original_genre_names))


Logistic Regression F1-Score: 0.5253512736728788
               precision    recall  f1-score   support

      action        0.31      0.44      0.36      1314
       adult        0.38      0.55      0.45       590
   adventure        0.19      0.30      0.24       775
   animation        0.17      0.24      0.20       498
   biography        0.04      0.11      0.06       264
      comedy        0.58      0.49      0.53      7446
       crime        0.14      0.28      0.18       505
 documentary        0.79      0.64      0.70     13096
       drama        0.68      0.47      0.55     13612
      family        0.15      0.30      0.20       783
     fantasy        0.13      0.22      0.16       322
   game-show        0.66      0.69      0.68       193
     history        0.09      0.17      0.11       243
      horror        0.57      0.61      0.59      2204
       music        0.45      0.65      0.53       731
     musical        0.14      0.24      0.18       276
     mystery  