In [None]:
from google.colab import drive
import pandas as pd

# Attach Google Drive so the dataset files below are reachable from the runtime.
drive.mount('/content/drive')

# Locations of the three raw dataset files.
train_file_path = '/content/drive/MyDrive/movie/train_data.txt'
test_file_path = '/content/drive/MyDrive/movie/test_data.txt'
test_solution_file_path = '/content/drive/MyDrive/movie/test_data_solution.txt'

# All three files share the ' ::: ' field delimiter and are read with the
# python parser engine; only the column names differ per file.
read_options = dict(sep=' ::: ', engine='python')
labelled_columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']

train_data = pd.read_csv(train_file_path, names=labelled_columns, **read_options)
# The plain test file is read without a GENRE column.
test_data = pd.read_csv(test_file_path, names=['ID', 'TITLE', 'DESCRIPTION'], **read_options)
test_data_solution = pd.read_csv(test_solution_file_path, names=labelled_columns, **read_options)

# Preview the first rows of each frame as a quick sanity check.
for heading, frame in (
    ("Train Data:", train_data),
    ("\nTest Data:", test_data),
    ("\nTest Data Solution:", test_data_solution),
):
    print(heading)
    print(frame.head())


Mounted at /content/drive
Train Data:
   ID                             TITLE     GENRE  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         DESCRIPTION  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  

Test Data:
   ID                        TITLE  \
0   1         Edgar's Lunch (1998)   
1   2     La guerra de papá (1977)   
2   3  Off the Beaten Track (2010)   
3   4       Meu Amigo Hindu (2015)   
4   5            Er nu zhai (1955)   

                                         DESCRIPTION  
0  L.R. Brane lov

In [None]:
def combine_title_and_description(frame, required_columns):
    """Merge TITLE and DESCRIPTION into a single TEXT column and drop incomplete rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Either a raw frame that still has 'TITLE' and 'DESCRIPTION' columns,
        or a frame this function has already processed (it then has 'TEXT').
    required_columns : list of str
        Columns that must be non-null; rows missing any of them are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'TEXT' in place of 'TITLE'/'DESCRIPTION'. The input
        frame is not modified.
    """
    source_columns = ['TITLE', 'DESCRIPTION']
    # Only build TEXT while the source columns are still present. This makes
    # the cell safe to re-run: the original version raised KeyError: 'TITLE'
    # on a second execution because the columns had already been dropped.
    if set(source_columns).issubset(frame.columns):
        frame = frame.assign(
            TEXT=frame['TITLE'] + ' ' + frame['DESCRIPTION']
        ).drop(columns=source_columns)
    # A missing TITLE or DESCRIPTION propagates to a missing TEXT, so those
    # rows are removed here as well.
    return frame.dropna(subset=required_columns)


# Combine TITLE and DESCRIPTION into a single text column, drop the source
# columns, and remove rows with missing values.
train_data = combine_title_and_description(train_data, ['GENRE', 'TEXT'])
test_data = combine_title_and_description(test_data, ['TEXT'])
test_data_solution = combine_title_and_description(test_data_solution, ['GENRE', 'TEXT'])

# Display the first few rows of each DataFrame after preprocessing
print("Train Data After Preprocessing:")
print(train_data.head())
print("\nTest Data After Preprocessing:")
print(test_data.head())
print("\nTest Data Solution After Preprocessing:")
print(test_data_solution.head())

Train Data After Preprocessing:
   ID     GENRE                                               TEXT
0   1     drama  Oscar et la dame rose (2009) Listening in to a...
1   2  thriller  Cupid (1997) A brother and sister with a past ...
2   3     adult  Young, Wild and Wonderful (1980) As the bus em...
3   4     drama  The Secret Sin (1915) To help their unemployed...
4   5     drama  The Unrecovered (2007) The film's title refers...

Test Data After Preprocessing:
   ID                                               TEXT
0   1  Edgar's Lunch (1998) L.R. Brane loves his life...
1   2  La guerra de papá (1977) Spain, March 1964: Qu...
2   3  Off the Beaten Track (2010) One year in the li...
3   4  Meu Amigo Hindu (2015) His father has died, he...
4   5  Er nu zhai (1955) Before he was known internat...

Test Data Solution After Preprocessing:
   ID        GENRE                                               TEXT
0   1     thriller  Edgar's Lunch (1998) L.R. Brane loves his life...
1   2      

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

import joblib

# Vectorizer configuration: English stop words removed, vocabulary capped at
# 5000 features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# The vocabulary is fitted on the training text only; both test sets are
# merely transformed with the fitted vectorizer.
y_train = train_data['GENRE']
X_train = tfidf_vectorizer.fit_transform(train_data['TEXT'])

X_test = tfidf_vectorizer.transform(test_data['TEXT'])
X_test_solution = tfidf_vectorizer.transform(test_data_solution['TEXT'])
y_test_solution = test_data_solution['GENRE']

# Persist the fitted vectorizer and every derived matrix/label series so
# later cells or sessions can reuse them without recomputing.
artifacts = [
    (tfidf_vectorizer, '/content/drive/MyDrive/movie/tfidf_vectorizer.pkl'),
    (X_train, '/content/drive/MyDrive/movie/X_train.pkl'),
    (y_train, '/content/drive/MyDrive/movie/y_train.pkl'),
    (X_test, '/content/drive/MyDrive/movie/X_test.pkl'),
    (X_test_solution, '/content/drive/MyDrive/movie/X_test_solution.pkl'),
    (y_test_solution, '/content/drive/MyDrive/movie/y_test_solution.pkl'),
]
for artifact, destination in artifacts:
    joblib.dump(artifact, destination)

print("Feature extraction completed and saved.")

Feature extraction completed and saved.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation set (fixed seed so the
# split is reproducible).
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_split, y_train_split)

# Save the trained model
joblib.dump(model, '/content/drive/MyDrive/movie/logistic_regression_model.pkl')

# Predict on validation data
y_val_pred = model.predict(X_val)

# Evaluate the model.
# zero_division=0 is passed explicitly: some genres receive no predictions at
# all, and the default ("warn") reports the same 0.0 scores but emits an
# UndefinedMetricWarning for each affected average, cluttering the output.
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred, zero_division=0))

# Predict on the labelled test set
y_test_pred = model.predict(X_test_solution)

# Evaluate the model on test data
print("Test Accuracy:", accuracy_score(y_test_solution, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test_solution, y_test_pred, zero_division=0))


Validation Accuracy: 0.5863690860462971


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Classification Report:
               precision    recall  f1-score   support

      action       0.50      0.27      0.35       263
       adult       0.72      0.23      0.35       112
   adventure       0.44      0.14      0.22       139
   animation       0.67      0.08      0.14       104
   biography       0.00      0.00      0.00        61
      comedy       0.52      0.58      0.55      1443
       crime       0.38      0.03      0.05       107
 documentary       0.67      0.85      0.75      2659
       drama       0.54      0.78      0.64      2697
      family       0.38      0.08      0.13       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.95      0.45      0.61        40
     history       0.00      0.00      0.00        45
      horror       0.68      0.59      0.63       431
       music       0.61      0.48      0.54       144
     musical       1.00      0.04      0.08        50
     mystery       1.00      0.02      0.04   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test Classification Report:
               precision    recall  f1-score   support

      action       0.50      0.27      0.35      1314
       adult       0.62      0.21      0.31       590
   adventure       0.61      0.18      0.28       775
   animation       0.51      0.05      0.08       498
   biography       0.00      0.00      0.00       264
      comedy       0.53      0.58      0.55      7446
       crime       0.40      0.03      0.06       505
 documentary       0.66      0.85      0.75     13096
       drama       0.54      0.77      0.64     13612
      family       0.50      0.09      0.15       783
     fantasy       0.70      0.02      0.04       322
   game-show       0.91      0.47      0.62       193
     history       0.00      0.00      0.00       243
      horror       0.65      0.58      0.61      2204
       music       0.69      0.45      0.54       731
     musical       0.29      0.02      0.03       276
     mystery       0.40      0.01      0.02       31

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import joblib
import pickle

# Reload the artifacts written by the training cells. Both are loaded the same
# way, by passing the file path straight to joblib.load, instead of mixing a
# path-based load with a manually opened file handle.
# NOTE: joblib files are pickle-based; only load files from a trusted source.

# Load the trained model with full path
logistic_regression = joblib.load('/content/drive/MyDrive/movie/logistic_regression_model.pkl')

# Load the TF-IDF vectorizer
tfidf_vectorizer = joblib.load('/content/drive/MyDrive/movie/tfidf_vectorizer.pkl')

In [None]:
def predict_genre(plot, model, vectorizer):
    """Return the predicted genre for a single plot description.

    Parameters
    ----------
    plot : the plot text to classify.
    model : object exposing ``predict(features)``.
    vectorizer : object exposing ``transform(list_of_texts)``.

    Returns
    -------
    The first (and only) element of the model's prediction for ``plot``.
    """
    # The vectorizer works on a collection of documents, so the single plot is
    # wrapped in a one-element list and the single prediction is unwrapped.
    return model.predict(vectorizer.transform([plot]))[0]

In [None]:
# Example 1: classify a sample plot and show it with the prediction,
# followed by an 80-character separator line.
plot1 = "In a post-apocalyptic world, a family struggles to survive against mutated creatures that hunt by sound."
predicted_genre1 = predict_genre(plot1, logistic_regression, tfidf_vectorizer)
print(
    f"Plot 1: {plot1}",
    f"Predicted Genre (Logistic Regression): {predicted_genre1}",
    '-' * 80,
    sep='\n',
)

Plot 1: In a post-apocalyptic world, a family struggles to survive against mutated creatures that hunt by sound.
Predicted Genre (Logistic Regression): drama
--------------------------------------------------------------------------------


In [None]:
# Example 2: classify a sample plot and show it with the prediction.
plot2 = "In 2154, astronauts explore a distant exoplanet to find a new home for humanity, encountering alien life forms and advanced technology."
predicted_genre2 = predict_genre(plot2, logistic_regression, tfidf_vectorizer)
print(
    f"Plot 2: {plot2}",
    f"Predicted Genre (Logistic Regression): {predicted_genre2}",
    sep='\n',
)

Plot 2: In 2154, astronauts explore a distant exoplanet to find a new home for humanity, encountering alien life forms and advanced technology.
Predicted Genre (Logistic Regression): sci-fi


In [None]:
# Example 3: classify a sample plot and show it with the prediction.
plot3 = "A group of mismatched friends start a pet-sitting business, leading to hilarious mishaps and quirky client encounters."
predicted_genre3 = predict_genre(plot3, logistic_regression, tfidf_vectorizer)
print(
    f"Plot 3: {plot3}",
    f"Predicted Genre (Logistic Regression): {predicted_genre3}",
    sep='\n',
)

Plot 3: A group of mismatched friends start a pet-sitting business, leading to hilarious mishaps and quirky client encounters.
Predicted Genre (Logistic Regression): comedy
