# Movie genre classification

In [None]:
import pandas as pd

# Convert the ":::"-delimited training file into train_data.csv.
# Each record is expected to look like:
#   serial_no ::: title (year) ::: genre ::: description
data = []

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding (assumes the file is UTF-8, which is also what to_csv writes).
with open("train_data.txt", 'r', encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # A blank line (e.g. at the end of the file) carries no record
            continue

        # Split at most three times so a ":::" inside the description stays
        # part of the description instead of breaking the unpacking below
        parts = line.split(":::", maxsplit=3)
        if len(parts) != 4:
            raise ValueError(
                f"train_data.txt line {line_no}: expected 4 ':::'-separated "
                f"fields, got {len(parts)}"
            )

        serial_no, title_year, genre, description = parts
        data.append([serial_no, title_year, genre, description])

# Create a DataFrame
df = pd.DataFrame(data, columns=["serial_no", "title_year", "genre", "description"])

# Save as CSV
df.to_csv("train_data.csv", index=False)

In [12]:
import pandas as pd

# Convert the ":::"-delimited test file into test_data.csv.
# Each record is expected to look like:
#   serial_no ::: title (year) ::: description
data = []

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding; titles in this file contain non-ASCII characters.
with open("test_data.txt", 'r', encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line is not a record; keeping it would add an all-empty
            # row that has no counterpart in the solution file
            continue

        # Split at most twice so a ":::" inside the description is preserved
        parts = line.split(":::", maxsplit=2)

        # Best effort for short lines: pad the missing trailing fields
        if len(parts) < 3:
            parts += [""] * (3 - len(parts))

        serial_no, title_year, description = parts
        data.append([serial_no, title_year, description])

# Create a DataFrame
df = pd.DataFrame(data, columns=["serial_no","title_year", "description"])

# Save as CSV
df.to_csv("test_data.csv", index=False)

In [16]:
import pandas as pd

# Convert the ":::"-delimited labelled test file into test_data_solution.csv.
# Each record is expected to look like:
#   serial_no ::: title (year) ::: genre ::: description
data = []

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding (assumes the file is UTF-8, which is also what to_csv writes).
with open("test_data_solution.txt", 'r', encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # A blank line (e.g. at the end of the file) carries no record
            continue

        # Split at most three times so a ":::" inside the description stays
        # part of the description instead of breaking the unpacking below
        parts = line.split(":::", maxsplit=3)
        if len(parts) != 4:
            raise ValueError(
                f"test_data_solution.txt line {line_no}: expected 4 "
                f"':::'-separated fields, got {len(parts)}"
            )

        serial_no, title_year, genre, description = parts
        data.append([serial_no, title_year, genre, description])

# Create a DataFrame
df = pd.DataFrame(data, columns=["serial_no", "title_year", "genre", "description"])

# Save as CSV
df.to_csv("test_data_solution.csv", index=False)

In [29]:
!pip install nltk
import nltk
nltk.download('stopwords')
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC  # Using Support Vector Machine (SVM)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.util import ngrams

# Load Data
train_data = pd.read_csv("train_data.csv")
plot_summaries_train = train_data['description']
genres_train = train_data['genre']

test_data = pd.read_csv("test_data.csv")
plot_summaries_test = test_data['description']

test_solution = pd.read_csv("test_data_solution.csv")
genres_test_solution = test_solution['genre']

# Print First Five Records
print("Train Data (First 5 Records):")
print(train_data.head())

print("\nTest Data (First 5 Records):")
print(test_data.head())

print("\nTest Data Solution (First 5 Records):")
print(test_solution.head())

# Text Preprocessing

# Punctuation-stripped English stop words, loaded once on first use
_STOP_WORD_CACHE = {}

def process_text_data(text):
  """Normalise one plot summary before vectorisation.

  Lower-cases the text, deletes every character found in
  ``string.punctuation`` and drops NLTK's English stop words.

  Args:
    text: Raw description. Anything that is not a ``str`` (for example the
      NaN pandas uses for a missing cell) is treated as an empty description.

  Returns:
    The remaining words joined by single spaces ("" when nothing is left).
  """
  import string
  from nltk.corpus import stopwords

  if not isinstance(text, str):
    return ""

  strip_punctuation = str.maketrans("", "", string.punctuation)

  if "english" not in _STOP_WORD_CACHE:
    # The text loses its punctuation before filtering, so stop words such as
    # "don't" must lose theirs too or they would never match ("dont").
    # A frozenset makes each membership test O(1), and caching avoids
    # re-reading the stop-word corpus for every summary.
    _STOP_WORD_CACHE["english"] = frozenset(
        word.translate(strip_punctuation) for word in stopwords.words('english')
    )
  stop_words = _STOP_WORD_CACHE["english"]

  # Lowercase conversion and punctuation removal
  text = text.lower().translate(strip_punctuation)

  # Stop word removal
  return " ".join(word for word in text.split() if word not in stop_words)

plot_summaries_train = plot_summaries_train.apply(process_text_data)
plot_summaries_test = plot_summaries_test.apply(process_text_data)

# Feature Engineering
# N-gram Features (bigrams as an example)
def generate_ngrams(text, n):
  """Return the word-level n-grams of ``text``.

  The text is tokenised on whitespace and the tokens are passed to
  ``nltk.util.ngrams``; whatever that call returns is handed back unchanged.

  Args:
    text: String to tokenise.
    n: Size of each n-gram (2 for bigrams).
  """
  tokens = text.split()
  return ngrams(tokens, n)

# Feature extraction: TF-IDF over unigrams and bigrams.
#
# The vectorizer builds the bigrams itself (ngram_range=(1, 2)), which keeps
# exactly one feature row per summary so the rows stay aligned with the genre
# labels. Appending a corpus-wide list of bigrams to the list of documents
# would add extra "documents" that have no label.
# Vectorization (consider adjusting max_features)
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))

# Learn the vocabulary and IDF weights from the training summaries only ...
plot_summaries_train_vectorized = vectorizer.fit_transform(plot_summaries_train)

# ... and project the test summaries into that same feature space
plot_summaries_test_vectorized = vectorizer.transform(plot_summaries_test)

# Train the Model (using SVC for SVM)
classifier = SVC()  # Hyperparameter tuning might be needed
classifier.fit(plot_summaries_train_vectorized, genres_train)

# Predict Genres on Test Data
y_pred = classifier.predict(plot_summaries_test_vectorized)

# Print Predictions for First Five Results
# Slicing (rather than indexing 0..4) also works when fewer than five
# predictions are available.
print("\nPredicted Genres for First Five Test Data Entries:")
for predicted_genre in y_pred[:5]:
  print(f"  - Predicted Genre: {predicted_genre}")

# Evaluate Model Accuracy against the ground-truth genres
accuracy = accuracy_score(genres_test_solution, y_pred)
print("Model Accuracy:", accuracy)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Train Data (First 5 Records):
   serial_no                          title_year       genre  \
0          1       Oscar et la dame rose (2009)       drama    
1          2                       Cupid (1997)    thriller    
2          3   Young, Wild and Wonderful (1980)       adult    
3          4              The Secret Sin (1915)       drama    
4          5             The Unrecovered (2007)       drama    

                                         description  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  

Test Data (First 5 Records):
   serial_no                     title_year  \
0          1          Edgar's Lunch (1998)    
1          2      La guerra de papá (1977)    
2          3   Off the Beaten Track (2010)    
3          4        Meu Amigo Hindu (