In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import nltk


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset paths used by the
# loading cells all live under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [16]:
training_file = "/content/drive/MyDrive/Genre Classification Dataset/train_data.txt"

# Parse the training file line by line. A valid line carries four
# ":::"-separated fields: movie id, title, genre, plot summary.
data = []
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default encoding (the text fields may contain non-ASCII characters).
with open(training_file, "r", encoding="utf-8") as file:
    for line in file:
        # Split on at most three separators: the plot summary is free text,
        # so a ":::" inside it must stay part of the summary rather than
        # making the whole line look malformed.
        parts = line.strip().split(":::", 3)
        if len(parts) == 4:  # Ensure that the line contains all expected fields
            movie_id, title, genre, plot_summary = parts
            data.append([movie_id.strip(), title.strip(), genre.strip(), plot_summary.strip()])
        else:
            # Report (rather than silently drop) lines with too few fields.
            print("Invalid line:", line.strip())

# Print the length of the data list
print("Number of data points:", len(data))

# Create a DataFrame (all columns, including movie_id, are kept as strings)
train_data = pd.DataFrame(data, columns=["movie_id", "movie_title", "genre", "plot_summary"])

# Display the DataFrame
print(train_data.head())
print(train_data.columns)




Number of data points: 54214
  movie_id                       movie_title     genre  \
0        1      Oscar et la dame rose (2009)     drama   
1        2                      Cupid (1997)  thriller   
2        3  Young, Wild and Wonderful (1980)     adult   
3        4             The Secret Sin (1915)     drama   
4        5            The Unrecovered (2007)     drama   

                                        plot_summary  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
Index(['movie_id', 'movie_title', 'genre', 'plot_summary'], dtype='object')


In [17]:
testing_file = "/content/drive/MyDrive/Genre Classification Dataset/test_data.txt"

# Parse the unlabeled test file line by line. A valid line carries three
# ":::"-separated fields: movie id, title, plot summary.
test_data = []
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default encoding (titles in this file include accented characters).
with open(testing_file, "r", encoding="utf-8") as file:
    for line in file:
        # Split on at most two separators: the plot summary is free text,
        # so a ":::" inside it must stay part of the summary rather than
        # making the whole line look malformed.
        parts = line.strip().split(":::", 2)
        if len(parts) == 3:  # Ensure that the line contains all expected fields
            movie_id, title, plot_summary = parts
            test_data.append([movie_id.strip(), title.strip(), plot_summary.strip()])
        else:
            # Report (rather than silently drop) lines with too few fields.
            print("Invalid line:", line.strip())

# Print the length of the test data list
print("Number of test data points:", len(test_data))

# Create a DataFrame for test data (all columns are kept as strings)
test_df = pd.DataFrame(test_data, columns=["movie_id", "movie_title", "plot_summary"])

# Display the DataFrame
print(test_df.head())
print(test_df.columns)


Number of test data points: 54200
  movie_id                  movie_title  \
0        1         Edgar's Lunch (1998)   
1        2     La guerra de papá (1977)   
2        3  Off the Beaten Track (2010)   
3        4       Meu Amigo Hindu (2015)   
4        5            Er nu zhai (1955)   

                                        plot_summary  
0  L.R. Brane loves his life - his car, his apart...  
1  Spain, March 1964: Quico is a very naughty chi...  
2  One year in the life of Albin and his family o...  
3  His father has died, he hasn't spoken with his...  
4  Before he was known internationally as a marti...  
Index(['movie_id', 'movie_title', 'plot_summary'], dtype='object')


In [18]:
testing_solutuion_file = "/content/drive/MyDrive/Genre Classification Dataset/test_data_solution.txt"
# Parse the labeled test file line by line. A valid line carries four
# ":::"-separated fields: movie id, title, genre, plot summary.
test_solution_data = []
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default encoding (titles in this file include accented characters).
with open(testing_solutuion_file, "r", encoding="utf-8") as file:
    for line in file:
        # Split on at most three separators: the plot summary is free text,
        # so a ":::" inside it must stay part of the summary rather than
        # making the whole line look malformed.
        parts = line.strip().split(":::", 3)
        if len(parts) == 4:  # Ensure that the line contains all expected fields
            movie_id, title, genre, plot_summary = parts
            test_solution_data.append([movie_id.strip(), title.strip(), genre.strip(), plot_summary.strip()])
        else:
            # Report (rather than silently drop) lines with too few fields.
            print("Invalid line:", line.strip())

# Print the length of the test solution data list
print("Number of test solution data points:", len(test_solution_data))

# Create a DataFrame for test solution data (all columns are kept as strings)
test_solution_df = pd.DataFrame(test_solution_data, columns=["movie_id", "movie_title", "genre", "plot_summary"])

# Display the DataFrame
print(test_solution_df.head())
print(test_solution_df.columns)


Number of test solution data points: 54200
  movie_id                  movie_title        genre  \
0        1         Edgar's Lunch (1998)     thriller   
1        2     La guerra de papá (1977)       comedy   
2        3  Off the Beaten Track (2010)  documentary   
3        4       Meu Amigo Hindu (2015)        drama   
4        5            Er nu zhai (1955)        drama   

                                        plot_summary  
0  L.R. Brane loves his life - his car, his apart...  
1  Spain, March 1964: Quico is a very naughty chi...  
2  One year in the life of Albin and his family o...  
3  His father has died, he hasn't spoken with his...  
4  Before he was known internationally as a marti...  
Index(['movie_id', 'movie_title', 'genre', 'plot_summary'], dtype='object')


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Raw inputs: plot summaries are the features, genres are the targets.
# Both the test features and the test labels are taken from test_solution_df.
X_train = train_data['plot_summary'].values
y_train = train_data['genre'].values
X_test = test_solution_df['plot_summary'].values
y_test = test_solution_df['genre'].values

# Fit the vectorizer on the training summaries only, then reuse that fitted
# vectorizer to project the test summaries into the same feature space.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Report the (n_documents, n_features) shape of each matrix
print("Shape of TF-IDF matrix for training data:", X_train_tfidf.shape)
print("Shape of TF-IDF matrix for test data:", X_test_tfidf.shape)

Shape of TF-IDF matrix for training data: (54214, 121110)
Shape of TF-IDF matrix for test data: (54200, 121110)


In [22]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with a raised iteration cap
# (3000) so the solver has room to converge on the TF-IDF features.
lr_classifier = LogisticRegression(max_iter=3000)

# Fit on the training TF-IDF matrix against the genre labels
lr_classifier.fit(X=X_train_tfidf, y=y_train)


In [23]:
# Predict the labels for the test data with the baseline classifier
y_pred_lr = lr_classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Accuracy (Logistic Regression):", accuracy_lr)

# Generate classification report.
# Some genres are never predicted, which makes their precision undefined.
# zero_division=0 reports those scores as 0.0 (the same value the default
# produces) without emitting a warning for every affected average.
print("Classification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_lr, zero_division=0))


Accuracy (Logistic Regression): 0.5907749077490775
Classification Report (Logistic Regression):


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      action       0.52      0.25      0.34      1314
       adult       0.65      0.20      0.31       590
   adventure       0.74      0.15      0.25       775
   animation       0.63      0.02      0.05       498
   biography       0.00      0.00      0.00       264
      comedy       0.54      0.60      0.57      7446
       crime       0.44      0.02      0.04       505
 documentary       0.67      0.88      0.76     13096
       drama       0.54      0.80      0.64     13612
      family       0.51      0.06      0.11       783
     fantasy       0.60      0.01      0.02       322
   game-show       0.91      0.48      0.63       193
     history       0.00      0.00      0.00       243
      horror       0.67      0.56      0.61      2204
       music       0.69      0.41      0.52       731
     musical       1.00      0.01      0.01       276
     mystery       0.00      0.00      0.00       318
        news       0.78    

  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid to search
param_grid_lr = {
    'C': [0.1, 1, 10],        # Regularization parameter
    'solver': ['liblinear', 'saga'],  # Algorithm to use in the optimization problem
    'max_iter': [100, 500, 1000]  # Maximum number of iterations
}

# Initialize the grid search with a fresh base estimator and the parameter grid.
# The base estimator is passed inline instead of being bound to `lr_classifier`:
# GridSearchCV only fits clones of it, so rebinding that name would replace the
# already-fitted baseline model with an unfitted one and break any re-run of the
# baseline evaluation cell.
# random_state is fixed because the 'liblinear' and 'saga' solvers shuffle the
# data, which otherwise makes the search results vary from run to run.
grid_search_lr = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid_lr,
    cv=5,
    n_jobs=-1,
)

# Perform grid search to find the best hyperparameters
grid_search_lr.fit(X_train_tfidf, y_train)

# Print the best hyperparameters found
print("Best hyperparameters:", grid_search_lr.best_params_)

# Get the best model (refit on the full training matrix by GridSearchCV)
best_lr_classifier = grid_search_lr.best_estimator_

# Predict the labels for the test data using the best model
y_pred_best_lr = best_lr_classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy_best_lr = accuracy_score(y_test, y_pred_best_lr)
print("Accuracy (Best Model - Logistic Regression):", accuracy_best_lr)

# Generate classification report for the best model.
# zero_division=0 reports undefined precision (classes that are never
# predicted) as 0.0 without emitting a warning.
print("Classification Report (Best Model - Logistic Regression):")
print(classification_report(y_test, y_pred_best_lr, zero_division=0))


Best hyperparameters: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'}
Accuracy (Best Model - Logistic Regression): 0.5983579335793358
Classification Report (Best Model - Logistic Regression):
              precision    recall  f1-score   support

      action       0.47      0.34      0.39      1314
       adult       0.67      0.37      0.47       590
   adventure       0.58      0.21      0.31       775
   animation       0.49      0.10      0.17       498
   biography       0.00      0.00      0.00       264
      comedy       0.56      0.60      0.58      7446
       crime       0.32      0.06      0.10       505
 documentary       0.70      0.85      0.76     13096
       drama       0.56      0.75      0.64     13612
      family       0.45      0.14      0.21       783
     fantasy       0.35      0.07      0.12       322
   game-show       0.90      0.60      0.72       193
     history       0.50      0.01      0.02       243
      horror       0.66      0.61      0.63      