In [8]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds train_data.txt, test_data.txt and test_data_solution.txt.
# Defaults to the original author's local checkout; set the GENRE_DATA_DIR
# environment variable to run the notebook on another machine without editing
# every path below.
DATA_DIR = Path(os.environ.get(
    'GENRE_DATA_DIR',
    '/home/aditya/Desktop/codsoft/task2/Genre Classification Dataset',
))

# Define the correct separator
# A multi-character separator is interpreted as a regular expression, which is
# why every read below uses the (slower) python parsing engine.
sep = ':::'  # Adjust if necessary

# Column layouts of the header-less dataset files.
LABELLED_COLUMNS = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
UNLABELLED_COLUMNS = ['ID', 'TITLE', 'DESCRIPTION']

# Load the data
train_data = pd.read_csv(
    DATA_DIR / 'train_data.txt',
    sep=sep, engine='python', names=LABELLED_COLUMNS, header=None,
)
test_data = pd.read_csv(
    DATA_DIR / 'test_data.txt',
    sep=sep, engine='python', names=UNLABELLED_COLUMNS, header=None,
)
# The solution file has no header row either. Without header=None pandas would
# consume the first record as column labels, silently dropping one sample and
# producing meaningless column names.
# NOTE(review): assumes the solution file uses the same four-field layout as
# train_data.txt -- confirm against the dataset description.
test_data_solution = pd.read_csv(
    DATA_DIR / 'test_data_solution.txt',
    sep=sep, engine='python', names=LABELLED_COLUMNS, header=None,
)

# Display the first few rows and column names of the data
print("Train Data Columns and Sample:")
print(train_data.columns)
print(train_data.head())

print("\nTest Data Columns and Sample:")
print(test_data.columns)
print(test_data.head())

print("\nTest Data Solution Columns and Sample:")
print(test_data_solution.columns)
print(test_data_solution.head())


Train Data Columns and Sample:
Index(['ID', 'TITLE', 'GENRE', 'DESCRIPTION'], dtype='object')
   ID                               TITLE       GENRE   
0   1       Oscar et la dame rose (2009)       drama   \
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                         DESCRIPTION  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  

Test Data Columns and Sample:
Index(['ID', 'TITLE', 'DESCRIPTION'], dtype='object')
   ID                          TITLE   
0   1          Edgar's Lunch (1998)   \
1   2      La guerra de papá (1977)    
2   3   Off the Beaten Track

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Column names based on your description
plot_column = 'DESCRIPTION'
genre_column = 'GENRE'

# Tunable settings for this stage, gathered in one place.
MAX_TFIDF_FEATURES = 5000   # vocabulary size cap for the TF-IDF vectorizer
VALIDATION_FRACTION = 0.2   # share of the labelled data held out for validation
RANDOM_SEED = 42            # fixes the train/validation split for reproducibility

# Handle missing values
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the column selection: the in-place form acts on an intermediate object, warns
# on recent pandas, and leaves the frame untouched under Copy-on-Write, which
# would let NaN reach the vectorizer.
train_data[plot_column] = train_data[plot_column].fillna('')
test_data[plot_column] = test_data[plot_column].fillna('')

# Encode the target variable (genre)
# label_encoder is kept so later cells can map integer predictions back to
# genre names through label_encoder.classes_.
label_encoder = LabelEncoder()
train_data['genre_encoded'] = label_encoder.fit_transform(train_data[genre_column])

# Text vectorization using TF-IDF
# The vocabulary and IDF weights are learned from the labelled descriptions and
# then applied unchanged to the unlabelled test descriptions.
# NOTE(review): the vectorizer is fitted before the train/validation split, so
# validation documents contribute to the IDF statistics. Validation scores may
# therefore be slightly optimistic; fit on the training part only if an
# unbiased estimate matters.
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data[plot_column])
X_test_tfidf = tfidf_vectorizer.transform(test_data[plot_column])

# Features and target variable
X = X_train_tfidf
y = train_data['genre_encoded']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VALIDATION_FRACTION, random_state=RANDOM_SEED
)


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize and train the model
# A generous iteration cap gives the solver room to converge on the
# high-dimensional TF-IDF features.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = log_reg.predict(X_val)

# Evaluate the model
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(
    y_val,
    y_val_pred,
    # List every encoded genre explicitly so the report still lines up with
    # target_names if some genre is missing from the validation split.
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
    # Genres the model never predicts have undefined precision; report them as
    # 0 without emitting an UndefinedMetricWarning for each one.
    zero_division=0,
))


Logistic Regression
Accuracy: 0.5787143779396846
               precision    recall  f1-score   support

      action        0.52      0.26      0.35       263
       adult        0.75      0.21      0.33       112
   adventure        0.43      0.14      0.21       139
   animation        0.56      0.09      0.15       104
   biography        0.00      0.00      0.00        61
      comedy        0.51      0.58      0.55      1443
       crime        0.29      0.02      0.04       107
 documentary        0.66      0.84      0.74      2659
       drama        0.54      0.78      0.64      2697
      family        0.39      0.07      0.12       150
     fantasy        0.00      0.00      0.00        74
   game-show        0.94      0.42      0.59        40
     history        0.00      0.00      0.00        45
      horror        0.64      0.56      0.60       431
       music        0.62      0.47      0.53       144
     musical        1.00      0.02      0.04        50
     mystery   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
from sklearn.naive_bayes import MultinomialNB

# Initialize and train the model
# Multinomial Naive Bayes with default smoothing, trained on the same TF-IDF
# features as the other classifiers.
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = nb.predict(X_val)

# Evaluate the model
print("Naive Bayes")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(
    y_val,
    y_val_pred,
    # List every encoded genre explicitly so the report still lines up with
    # target_names if some genre is missing from the validation split.
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
    # Genres the model never predicts have undefined precision; report them as
    # 0 without emitting an UndefinedMetricWarning for each one.
    zero_division=0,
))


Naive Bayes
Accuracy: 0.52310246241815
               precision    recall  f1-score   support

      action        0.57      0.08      0.14       263
       adult        0.88      0.06      0.12       112
   adventure        0.29      0.03      0.05       139
   animation        0.00      0.00      0.00       104
   biography        0.00      0.00      0.00        61
      comedy        0.51      0.44      0.47      1443
       crime        0.00      0.00      0.00       107
 documentary        0.58      0.88      0.70      2659
       drama        0.46      0.83      0.59      2697
      family        1.00      0.01      0.01       150
     fantasy        0.00      0.00      0.00        74
   game-show        1.00      0.15      0.26        40
     history        0.00      0.00      0.00        45
      horror        0.73      0.38      0.50       431
       music        0.79      0.10      0.18       144
     musical        0.00      0.00      0.00        50
     mystery        0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
from sklearn.svm import LinearSVC

# Initialize and train the model
# A generous iteration cap gives the linear SVM solver room to converge on the
# high-dimensional TF-IDF features.
svc = LinearSVC(max_iter=10000)
svc.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = svc.predict(X_val)

# Evaluate the model
print("Support Vector Machine")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(
    y_val,
    y_val_pred,
    # List every encoded genre explicitly so the report still lines up with
    # target_names if some genre is missing from the validation split.
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
    # Genres the model never predicts have undefined precision; report them as
    # 0 without emitting an UndefinedMetricWarning for each one.
    zero_division=0,
))




Support Vector Machine
Accuracy: 0.5686618094623259
               precision    recall  f1-score   support

      action        0.41      0.32      0.36       263
       adult        0.59      0.38      0.46       112
   adventure        0.31      0.22      0.26       139
   animation        0.40      0.18      0.25       104
   biography        0.00      0.00      0.00        61
      comedy        0.51      0.56      0.53      1443
       crime        0.21      0.06      0.09       107
 documentary        0.69      0.81      0.75      2659
       drama        0.56      0.71      0.62      2697
      family        0.24      0.12      0.16       150
     fantasy        0.09      0.01      0.02        74
   game-show        0.71      0.68      0.69        40
     history        0.00      0.00      0.00        45
      horror        0.60      0.63      0.61       431
       music        0.54      0.53      0.53       144
     musical        0.00      0.00      0.00        50
     mystery