In [None]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [None]:
# Download the NLTK data packages used by preprocess_text.
# 'punkt' supplies the tokenizer models used by word_tokenize
nltk.download('punkt')
# 'stopwords' supplies the English stop-word list
nltk.download('stopwords')
# 'wordnet' supplies the lexical database used by WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def preprocess_text(text):
    """Normalise a plot summary for bag-of-words feature extraction.

    Steps: replace every non-word character with a space, lower-case,
    tokenize, drop English stop words, lemmatize the remaining tokens and
    join them back with single spaces.

    Parameters
    ----------
    text : str
        Raw plot text.

    Returns
    -------
    str
        Space-separated, lemmatized tokens.
    """
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    # \W matches anything that is not a letter, digit or underscore, so
    # punctuation is blanked out before the text is lower-cased
    normalised = re.sub(r'\W', ' ', text).lower()

    kept_tokens = []
    for token in word_tokenize(normalised):
        # Stop words are filtered before lemmatization
        if token in english_stopwords:
            continue
        kept_tokens.append(wordnet.lemmatize(token))

    return ' '.join(kept_tokens)

In [None]:

# Read train text file into a DataFrame
def read_text_file(file_path):
    """Parse a ':::'-delimited training file into a DataFrame.

    Every non-blank line is split on ':::' into four fields; the first field
    is ignored and the remaining three are taken as title, genre and plot
    (each stripped of surrounding whitespace).

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded text file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Genre`` and ``Plot``, one row per well-formed
        line. Blank lines and lines with fewer than four fields are skipped.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of loading every line into memory first
        for line in file:
            line = line.strip()
            if not line:
                continue
            # maxsplit=3 keeps the plot intact even if it contains ':::'
            parts = line.split(':::', 3)
            # Four fields are required because the plot is read from index 3;
            # a three-field line would otherwise raise IndexError
            if len(parts) < 4:
                continue
            _, title, genre, plot = (part.strip() for part in parts)
            data.append((title, genre, plot))

    df = pd.DataFrame(data, columns=['Title', 'Genre', 'Plot'])
    return df

In [None]:
# Location of the training file, parsed into a Title/Genre/Plot DataFrame.
# NOTE(review): this is an absolute path under /content/drive, so it presumably
# relies on Google Drive being mounted in Colab — no mount step is visible
# here; confirm before running on a fresh kernel.
file_path = '/content/drive/MyDrive/encryptix/movie dataset/train_data.txt'
df = read_text_file(file_path)


In [None]:
# Display the training DataFrame.
# NOTE(review): the saved output lists a cleaned_plot column that is only
# created in a later cell, so this cell was presumably last run out of order.
df

Unnamed: 0,Title,Genre,Plot,cleaned_plot
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...,listening conversation doctor parent 10 year o...
1,Cupid (1997),thriller,A brother and sister with a past incestuous re...,brother sister past incestuous relationship cu...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...,bus empty student field trip museum natural hi...
3,The Secret Sin (1915),drama,To help their unemployed father make ends meet...,help unemployed father make end meet edith twi...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...,film title refers un recovered body ground zer...
...,...,...,...,...
54209,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on B...,short lived nbc live sitcom centered bonino wo...
54210,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The siste...,next generation exploitation sister kapa bay s...
54211,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about gr...",ze bestaan echt stand comedy growing facing fe...
54212,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and have...,walter vivian live country difficult time keep...


In [None]:
# Count training rows per genre to inspect class balance
df['Genre'].value_counts()

Genre
drama          13613
documentary    13096
comedy          7447
short           5073
horror          2204
thriller        1591
action          1315
western         1032
reality-tv       884
family           784
adventure        775
music            731
romance          672
sci-fi           647
adult            590
crime            505
animation        498
sport            432
talk-show        391
fantasy          323
mystery          319
musical          277
biography        265
history          243
game-show        194
news             181
war              132
Name: count, dtype: int64

In [None]:
# Clean every plot summary and store the result next to the raw text
df['cleaned_plot'] = df['Plot'].map(preprocess_text)

# Binarize the genres into an indicator matrix. MultiLabelBinarizer expects a
# collection of labels per sample, so each single genre is wrapped in a list.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform([[genre] for genre in df['Genre']])

In [None]:
# Feature extraction using TF-IDF
# The vocabulary (capped at the 5000 highest-frequency terms) and the IDF
# weights are learned from the training plots here.
tfidf = TfidfVectorizer(max_features=5000)
# Keep the result as the SciPy sparse matrix that fit_transform returns.
# Densifying with .toarray() would allocate rows x 5000 floats, almost all of
# them zero; train_test_split and LogisticRegression both accept sparse input.
X = tfidf.fit_transform(df['cleaned_plot'])

In [None]:
# Display the training DataFrame again, now including the cleaned_plot column
df

Unnamed: 0,Title,Genre,Plot,cleaned_plot
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...,listening conversation doctor parent 10 year o...
1,Cupid (1997),thriller,A brother and sister with a past incestuous re...,brother sister past incestuous relationship cu...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...,bus empty student field trip museum natural hi...
3,The Secret Sin (1915),drama,To help their unemployed father make ends meet...,help unemployed father make end meet edith twi...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...,film title refers un recovered body ground zer...
...,...,...,...,...
54209,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on B...,short lived nbc live sitcom centered bonino wo...
54210,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The siste...,next generation exploitation sister kapa bay s...
54211,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about gr...",ze bestaan echt stand comedy growing facing fe...
54212,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and have...,walter vivian live country difficult time keep...


In [None]:
# Train-test split
# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is repeatable.
# NOTE(review): no stratify argument is passed, so rare genres may be thinly
# represented in the hold-out set — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Model training using Logistic Regression with MultiOutputClassifier
# MultiOutputClassifier fits one independent LogisticRegression per column of
# y_train, i.e. one binary classifier per genre.
# NOTE(review): each sample carries exactly one genre (the label is wrapped in
# a single-element list before binarizing), so independent binary classifiers
# can predict no genre at all for a row. A single multiclass
# LogisticRegression on the raw genre labels may be a better fit — confirm
# the intent.
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

In [None]:
# Model evaluation
# Predict the genre-indicator rows for the held-out split
y_pred = model.predict(X_test)


In [None]:
# Decode predictions to genre labels
# inverse_transform maps each indicator row back to a tuple of genre names
# (an all-zero row decodes to an empty tuple)
y_pred_labels = mlb.inverse_transform(y_pred)
y_test_labels = mlb.inverse_transform(y_test)
# Show the raw indicator matrices: true labels first, predictions second
print(y_test,y_pred)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# accuracy_score on a label-indicator matrix is subset accuracy: a row only
# counts as correct when every genre column matches
accuracy = accuracy_score(y_test, y_pred)
# target_names labels the report rows with genre names instead of bare column
# indices; zero_division=0 reports 0.0 for genres that were never predicted
# instead of emitting an undefined-metric warning for each of them
report = classification_report(
    y_test, y_pred, target_names=list(mlb.classes_), zero_division=0
)

print(f'Accuracy: {accuracy}')
print('Classification Report:')

print(report)

Accuracy: 0.35313105229180114
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.08      0.14       263
           1       0.90      0.08      0.15       112
           2       0.50      0.04      0.07       139
           3       0.00      0.00      0.00       104
           4       0.00      0.00      0.00        61
           5       0.75      0.30      0.42      1443
           6       1.00      0.01      0.02       107
           7       0.81      0.67      0.73      2659
           8       0.68      0.45      0.55      2697
           9       1.00      0.03      0.05       150
          10       0.00      0.00      0.00        74
          11       1.00      0.23      0.37        40
          12       0.00      0.00      0.00        45
          13       0.81      0.31      0.45       431
          14       0.62      0.22      0.32       144
          15       0.00      0.00      0.00        50
          16       0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

# Read test text file into a DataFrame
def read_text_file(file_path):
    """Parse a ':::'-delimited, unlabeled test file into a DataFrame.

    Every non-blank line is split on ':::' into three fields; the first field
    is ignored and the remaining two are taken as title and plot (each
    stripped of surrounding whitespace).

    Note: this definition replaces any earlier ``read_text_file`` in the
    notebook namespace.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded text file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title`` and ``Plot``, one row per well-formed line. Blank
        lines and lines with fewer than three fields are skipped.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of loading every line into memory first
        for line in file:
            line = line.strip()
            if not line:
                continue
            # maxsplit=2 keeps the plot intact even if it contains ':::'
            parts = line.split(':::', 2)
            if len(parts) < 3:
                continue
            _, title, plot = (part.strip() for part in parts)
            data.append((title, plot))

    df_test = pd.DataFrame(data, columns=['Title', 'Plot'])
    return df_test

In [None]:
# Load the unlabeled test file with the two-column (Title, Plot) reader
# defined in the previous cell.
# NOTE(review): absolute path under /content/drive — presumably requires
# Google Drive to be mounted in Colab; confirm.
file_path = '/content/drive/MyDrive/encryptix/movie dataset/test_data.txt'  # Replace with your file path
df_test = read_text_file(file_path)


In [None]:
# Display the parsed test DataFrame (Title and Plot columns)
df_test

Unnamed: 0,Title,Plot
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,Er nu zhai (1955),Before he was known internationally as a marti...
...,...,...
54195,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Dar..."
54196,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their ...
54197,Oliver Twink (2007),"A movie 169 years in the making. Oliver Twist,..."
54198,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard ..."


In [None]:
# Preprocess plot summaries
# Uses the same preprocess_text function as the training plots, so both sets
# go through identical text normalisation
df_test['cleaned_plot'] = df_test['Plot'].apply(preprocess_text)

In [None]:
# Display the test DataFrame again, now including the cleaned_plot column
df_test

Unnamed: 0,Title,Plot,cleaned_plot
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart...",l r brane love life car apartment job especial...
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi...",spain march 1964 quico naughty child three bel...
2,Off the Beaten Track (2010),One year in the life of Albin and his family o...,one year life albin family shepherd north tran...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his...",father died spoken brother 10 year serious can...
4,Er nu zhai (1955),Before he was known internationally as a marti...,known internationally martial art superstar br...
...,...,...,...
54195,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Dar...",covering multiple genre tale light dark anthol...
54196,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their ...,alice cora munro attempt find father british o...
54197,Oliver Twink (2007),"A movie 169 years in the making. Oliver Twist,...",movie 169 year making oliver twist artful dodg...
54198,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard ...",popular mysterious rock j mike mallard askew b...


In [None]:
# Feature extraction using TF-IDF
# Reuse the vectorizer that was fitted on the training plots and only call
# transform here. Fitting a second vectorizer on the test set would learn a
# different vocabulary and column order, so the trained model's coefficients
# would be applied to unrelated features.
# The result stays sparse; model.predict accepts sparse input, and a dense
# copy would be rows x 5000 floats of almost all zeros.
Z = tfidf.transform(df_test['cleaned_plot'])

In [None]:
# Model evaluation
# Predict genre-indicator rows for the test plots; this rebinds y_pred,
# replacing the hold-out predictions computed earlier
y_pred = model.predict(Z)


In [None]:
# Decode predictions to genre labels
# Rebinds y_pred_labels, which now holds the decoded test-set predictions
y_pred_labels = mlb.inverse_transform(y_pred)

In [None]:

# Read test solution text file into a DataFrame
def read_text_file(file_path):
    """Parse a ':::'-delimited test-solution file into a DataFrame.

    Every non-blank line is split on ':::' into four fields; the first field
    is ignored and the remaining three are taken as title, genre and plot
    (each stripped of surrounding whitespace).

    Note: this definition replaces any earlier ``read_text_file`` in the
    notebook namespace.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded text file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Genre`` and ``Plot``, one row per well-formed
        line. Blank lines and lines with fewer than four fields are skipped.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of loading every line into memory first
        for line in file:
            line = line.strip()
            if not line:
                continue
            # maxsplit=3 keeps the plot intact even if it contains ':::'
            parts = line.split(':::', 3)
            # Four fields are required because the plot is read from index 3;
            # a three-field line would otherwise raise IndexError
            if len(parts) < 4:
                continue
            _, title, genre, plot = (part.strip() for part in parts)
            data.append((title, genre, plot))

    df_sol = pd.DataFrame(data, columns=['Title', 'Genre', 'Plot'])
    return df_sol

In [None]:
# Load the labeled test-solution file with the three-column
# (Title, Genre, Plot) reader defined in the previous cell.
# NOTE(review): absolute path under /content/drive — presumably requires
# Google Drive to be mounted in Colab; confirm.
file_path = '/content/drive/MyDrive/encryptix/movie dataset/test_data_solution.txt'  # Replace with your file path
df_sol = read_text_file(file_path)


In [None]:
# Display the parsed test-solution DataFrame (Title, Genre and Plot columns)
df_sol

Unnamed: 0,Title,Genre,Plot
0,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,Er nu zhai (1955),drama,Before he was known internationally as a marti...
...,...,...,...
54195,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Dar..."
54196,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their ...
54197,Oliver Twink (2007),adult,"A movie 169 years in the making. Oliver Twist,..."
54198,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard ..."


In [None]:
# Preprocess plot summaries
df_sol['cleaned_plot'] = df_sol['Plot'].apply(preprocess_text)

# Encode the reference genres with the binarizer that was fitted on the
# training labels. Calling transform (not fit_transform on a new
# MultiLabelBinarizer) guarantees that the columns of Q are in the same genre
# order as the model's outputs, and leaves mlb usable for decoding predictions.
Q = mlb.transform(df_sol['Genre'].apply(lambda x: [x]))

In [None]:
# Display the test-solution DataFrame again, now including the cleaned_plot column
df_sol

Unnamed: 0,Title,Genre,Plot,cleaned_plot
0,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart...",l r brane love life car apartment job especial...
1,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi...",spain march 1964 quico naughty child three bel...
2,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...,one year life albin family shepherd north tran...
3,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his...",father died spoken brother 10 year serious can...
4,Er nu zhai (1955),drama,Before he was known internationally as a marti...,known internationally martial art superstar br...
...,...,...,...,...
54195,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Dar...",covering multiple genre tale light dark anthol...
54196,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their ...,alice cora munro attempt find father british o...
54197,Oliver Twink (2007),adult,"A movie 169 years in the making. Oliver Twist,...",movie 169 year making oliver twist artful dodg...
54198,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard ...",popular mysterious rock j mike mallard askew b...


In [None]:
# Show the reference indicator matrix next to the model's test-set predictions
Q,y_pred

(array([[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [None]:
# Decode the reference indicator rows back to tuples of genre names
y_test_labels = mlb.inverse_transform(Q)

# accuracy_score on a label-indicator matrix is subset accuracy: a row only
# counts as correct when every genre column matches
accuracy = accuracy_score(Q, y_pred)
# target_names labels the report rows with genre names instead of bare column
# indices; zero_division=0 reports 0.0 for genres that were never predicted
# instead of emitting an undefined-metric warning for each of them
report = classification_report(
    Q, y_pred, target_names=list(mlb.classes_), zero_division=0
)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.014944649446494465
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1314
           1       0.00      0.00      0.00       590
           2       0.00      0.00      0.00       775
           3       0.00      0.00      0.00       498
           4       0.00      0.00      0.00       264
           5       0.16      0.01      0.01      7446
           6       0.00      0.00      0.00       505
           7       0.42      0.05      0.08     13096
           8       0.41      0.01      0.02     13612
           9       0.00      0.00      0.00       783
          10       0.00      0.00      0.00       322
          11       0.00      0.00      0.00       193
          12       0.00      0.00      0.00       243
          13       0.00      0.00      0.00      2204
          14       0.00      0.00      0.00       731
          15       0.00      0.00      0.00       276
          16       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
