In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, BaggingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Both files are ":::"-delimited text parsed with identical options.
# NOTE(review): the same three column names are applied to train and test —
# confirm test_data.txt really carries a GENRE field in that position.
read_opts = dict(sep=":::", names=["TITLE", "GENRE", "DESCRIPTION"], engine="python")
train = pd.read_csv('./Genre Classification Dataset/train_data.txt', **read_opts)
test = pd.read_csv('./Genre Classification Dataset/test_data.txt', **read_opts)

In [None]:
# (rows, columns) of the train frame, then of the test frame, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly rather than wrapped in print() (which added a stray "None" line).
train.info()
# head must be *called*; the bare attribute only displays the bound method.
train.head()

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly rather than wrapped in print() (which added a stray "None" line).
test.info()
# head must be *called*; printing the bare attribute shows the bound-method repr.
test.head()

In [None]:
# Rows per genre, most frequent first — class imbalance is clearly visible.
genre_counts = train['GENRE'].value_counts()
genre_counts

In [None]:
# Bar chart of the number of training rows per genre, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
rows_per_genre = train['GENRE'].value_counts()
rows_per_genre.plot(kind='bar', ax=ax)
ax.set_title('Count of each Genre')
ax.set_xlabel('Genre')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Column dtypes of the train frame, followed by those of the test frame.
print(train.dtypes, test.dtypes, sep="\n")

In [None]:
# Stop words dropped by `cleaning`. Built once here instead of on every call,
# since the function is applied to every row of the data.
STOP_WORDS = frozenset({"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"})


def cleaning(text):
    """Normalise raw text into lowercase, space-separated alphabetic tokens.

    Steps, in order:
      1. lowercase the text;
      2. drop whitespace-delimited words that start with '@' or 'http';
      3. delete every '.pic' substring;
      4. turn every non-alphabetic character into a space;
      5. drop stop words and words of length <= 2.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. the NaN pandas uses for a
        missing field) yields an empty string instead of raising.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (possibly empty).
    """
    # Missing values arrive as float NaN / None and have no .lower().
    if not isinstance(text, str):
        return ""

    # One pass removes both mentions and URLs.
    kept = [word for word in text.lower().split() if not word.startswith(('@', 'http'))]
    text = ' '.join(kept).replace('.pic', '')

    # Every non-letter (digits, punctuation, symbols) becomes a space, so no
    # separate punctuation-stripping pass is needed afterwards.
    text = ''.join(char if char.isalpha() else ' ' for char in text)

    # split()/join already collapses runs of whitespace.
    return ' '.join(word for word in text.split() if len(word) > 2 and word not in STOP_WORDS)

# Add a cleaned copy of each description to both frames.
for frame in (train, test):
    frame["clean_text"] = frame["DESCRIPTION"].apply(cleaning)

In [None]:
# Genre counts again, this time after the clean_text column has been added.
genre_counts_after_cleaning = train['GENRE'].value_counts()
genre_counts_after_cleaning

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Split the cleaned text *before* vectorising, so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only. Fitting on every row
# first would leak validation-set statistics into the features.
# The inputs are read straight from `train`, so re-running this cell always
# starts from the full data rather than from an already-split X_train.
# NOTE(review): the genre counts were flagged as imbalanced earlier in this
# notebook — consider stratify=train["GENRE"] once every genre is confirmed to
# have at least two rows.
text_train, text_val, y_train, y_val = train_test_split(
    train["clean_text"], train["GENRE"], test_size=0.2, random_state=42
)

# Fit on the training text only; validation and test text are just transformed.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_val = vectorizer.transform(text_val)
X_test = vectorizer.transform(test["clean_text"])

In [None]:
# Summarise the sparse training matrix instead of dumping its
# (row, column) value triplets into the cell output.
print("X_train shape:", X_train.shape, "| non-zero entries:", X_train.nnz)

In [None]:
# Logistic-regression baseline trained on the TF-IDF features.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Predict on validation set
y_pred_val = lr.predict(X_val)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
val_acc = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", val_acc)

In [None]:
# models = [
#     {
#         "name": "SVM",
#         "estimator": SVC(),
#         "param_grid": {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 'auto'], 'kernel': ['linear', 'rbf']},
#     },
#     {
#         "name": "Naive Bayes",
#         "estimator": MultinomialNB(),
#         "param_grid": {'alpha': [0.1, 1, 10]},
#     },
#     {
#         "name": "Decision Tree",
#         "estimator": DecisionTreeClassifier(),
#         "param_grid": {'max_depth': [None, 10, 50], 'min_samples_split': [2, 5, 10]},
#     },
#     {
#         "name": "Random Forest",
#         "estimator": RandomForestClassifier(),
#         "param_grid": {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 50], 'min_samples_split': [2, 5, 10]},
#     },
#     {
#         "name": "AdaBoost",
#         "estimator": AdaBoostClassifier(),
#         "param_grid": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1]},
#     },
#     {
#         "name": "Voting Classifier",
#         "estimator": VotingClassifier(estimators=[
#             ('svm', SVC()), ('nb', MultinomialNB()), ('dt', DecisionTreeClassifier()), ('rf', RandomForestClassifier()), ('ada', AdaBoostClassifier())
#         ]),
#         "param_grid": {'voting': ['hard', 'soft']},
#     },
#     {
#         "name": "Bagging Classifier",
#         "estimator": BaggingClassifier(),
#         "param_grid": {'n_estimators': [10, 50, 100]},
#     },
#     {
#         "name": "XGBoost",
#         "estimator": XGBClassifier(),
#         "param_grid": {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 10], 'learning_rate': [0.01, 0.1, 1]},
#     }
# ]

# # Grid search over every model takes too long because the dataset is large, so each model is simply fit with fixed settings instead.
# for model in models:
#     print("Training", model["name"])
#     grid_search = GridSearchCV(model["estimator"], model["param_grid"], cv=5, scoring='accuracy', n_jobs=-1)
#     grid_search.fit(X_train, y_train)
    
#     print("Best Parameters:", grid_search.best_params_)
#     best_model = grid_search.best_estimator_
#     y_pred = best_model.predict(X_val)
#     acc = accuracy_score(y_pred, y_val)
#     print("Validation Accuracy:", acc)

#     # Evaluate on test data
#     test_acc = best_model.score(X_test, test["GENRE"])
#     print("Test Accuracy:", test_acc)
#     print("=" * 50)

In [None]:
# SVM with an RBF kernel, trained on the TF-IDF features.
clf = SVC(kernel='rbf')
clf.fit(X_train, y_train)

# Predict on validation set
y_pred_val = clf.predict(X_val)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
val_acc = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", val_acc)

In [None]:
# Multinomial NB
# This cell previously re-trained SVC(kernel='rbf'), a copy of the SVM cell,
# so Naive Bayes was never actually evaluated.
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict on validation set
y_pred_val = clf.predict(X_val)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
val_acc = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", val_acc)