**Import the libary**

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Data load**

In [2]:
data = []
with open("train_data.txt","r",encoding="utf-8") as file:
    for line in file:
        parts = line.strip().split(":::")
        if len(parts)==4:
            data.append(parts)
train_data = pd.DataFrame(data,columns=["id","title","genre","description"])

In [3]:
data_test = []
with open("test_data.txt","r",encoding="utf-8") as file:
    for line in file:
        parts = line.strip().split(":::")
        if len(parts)==3:
            data_test.append(parts)
test_data = pd.DataFrame(data_test,columns=["id","title","description"])

**# Explore data analysis**

# train data explore analysis

In [4]:
train_data.head()

Unnamed: 0,id,title,genre,description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [5]:
train_data.describe()

Unnamed: 0,id,title,genre,description
count,54214,54214,54214,54214
unique,54214,54214,27,54086
top,1,Oscar et la dame rose (2009),drama,Grammy - music award of the American academy ...
freq,1,1,13613,12


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           54214 non-null  object
 1   title        54214 non-null  object
 2   genre        54214 non-null  object
 3   description  54214 non-null  object
dtypes: object(4)
memory usage: 1.7+ MB


In [7]:
train_data.nunique().sort_values(ascending=False)

id             54214
title          54214
description    54086
genre             27
dtype: int64

# test data explore analysis

In [6]:
test_data.head()

Unnamed: 0,id,title,description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


In [10]:
test_data.describe()

Unnamed: 0,id,title,description
count,54200,54200,54200
unique,54200,54200,54072
top,1,Edgar's Lunch (1998),Grammy - music award of the American academy ...
freq,1,1,10


In [7]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54200 entries, 0 to 54199
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           54200 non-null  object
 1   title        54200 non-null  object
 2   description  54200 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [8]:
test_data.nunique().sort_values(ascending=False)

id             54200
title          54200
description    54072
dtype: int64

**# Text preprocess**

In [9]:
#remove id column from both train and test data 
train_data = train_data.drop("id",axis=1)
test_data = test_data.drop("id",axis=1)

In [10]:
# Load stopwords
stop_words = set(stopwords.words('english'))

# Combine title and description, clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\d+", "", text)  # remove digits
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

# Create combined and cleaned text field
train_data["text"] = (train_data["title"] + " " + train_data["description"]).apply(clean_text)
test_data["text"] = (test_data["title"] + " " + test_data["description"]).apply(clean_text)


In [15]:
#vectorize
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train = tfidf.fit_transform(train_data["text"])
X_test = tfidf.transform(test_data["text"])

y_train = train_data["genre"]

In [17]:
# Split for validation
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [18]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_tr, y_tr= smote.fit_resample(X_tr, y_tr)


**# Model build and test**

In [19]:
models = {
    "LogisticRegression":LogisticRegression(class_weight='balanced', max_iter=1000),
    "MultinomalNB":MultinomialNB(),
    "SupportVectorMachine":SVC(class_weight='balanced')
}

In [None]:
#evaluate with valdation 
for name,model in models.items():
    model.fit(X_tr,y_tr)
    y_pred = model.predict(X_val)
    print(f"\n{name} Result: ")
    print("classification score: ",classification_report(y_pred,y_val))
    print("confusion matrixs: ", confusion_matrix(y_pred,y_val))
    



LogisticRegression Result: 
classification score:                 precision    recall  f1-score   support

      action        0.47      0.35      0.40       351
       adult        0.54      0.45      0.49       133
   adventure        0.31      0.24      0.27       181
   animation        0.26      0.24      0.25       111
   biography        0.03      0.06      0.04        35
      comedy        0.57      0.58      0.57      1416
       crime        0.21      0.15      0.18       155
 documentary        0.69      0.78      0.73      2342
       drama        0.53      0.67      0.59      2162
      family        0.27      0.17      0.20       242
     fantasy        0.11      0.13      0.12        60
   game-show        0.72      0.81      0.76        36
     history        0.04      0.03      0.04        64
      horror        0.68      0.61      0.64       484
       music        0.71      0.47      0.57       216
     musical        0.20      0.18      0.19        57
     mystery

: 

In [None]:
#evaluate with test data
for name,model in models.items():
    model.fit(X_tr,y_tr)
    y_pred = model.predict(X_val)
    print(f"\n{name} Result: ")
    print("accurcy score: ",accuracy_score(y_pred,y_val))
    print("classification score: ",classification_report(y_pred,y_val))
    print("confusion matrixs: ", confusion_matrix(y_pred,y_val))
    
