## Before you start

Let's make sure that we have access to a GPU. We can use the `nvidia-smi` command to check. If it reports a problem, navigate to `Edit` -> `Notebook settings` -> `Hardware accelerator`, set it to `GPU`, and then click `Save`.

In [None]:
# Shell escape: run the `nvidia-smi` command to check GPU access (it must be on PATH).
!nvidia-smi

In [2]:
import sys
print(sys.executable)

try:
    import tensorflow as tf
    print("TensorFlow version:", tf.__version__)
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
except ImportError as e:
    print("Error importing TensorFlow:", e)
import torch

# Check if CUDA is available
is_cuda_available = torch.cuda.is_available()
print("Is CUDA available:", is_cuda_available)

# Get the number of CUDA devices
cuda_device_count = torch.cuda.device_count()
print("CUDA device count:", cuda_device_count)

if is_cuda_available:
    for i in range(cuda_device_count):
        print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}")



c:\Users\AGOMOH\Desktop\CODSOFT\.venv\Scripts\python.exe
Error importing TensorFlow: No module named 'tensorflow'


ModuleNotFoundError: No module named 'torch'

Cell 1: Import necessary libraries

In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
import time

# Fetch the NLTK data used by the preprocessing cell: the Punkt tokenizer data
# for word_tokenize and the stop-word lists.
# NOTE(review): recent NLTK releases look the tokenizer data up under
# 'punkt_tab' while older ones use 'punkt' — both are requested so the notebook
# runs on either; confirm against the installed NLTK version.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Ensure output images directory exists
os.makedirs('images', exist_ok=True)




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AGOMOH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AGOMOH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cell 2: Load the dataset

In [4]:
import pandas as pd

# Load the dataset with specified encoding
def load_data(file_path, is_train=True, encoding='utf-8'):
    """Parse a ``' ::: '``-delimited text file into a DataFrame of strings.

    Each non-blank line is one record. Training-style files carry four fields
    (ID, TITLE, GENRE, DESCRIPTION); test-style files carry three
    (ID, TITLE, DESCRIPTION).

    Args:
        file_path: Path of the text file to read.
        is_train: True to expect the four-field layout, False for three fields.
        encoding: Text encoding used to open the file.

    Returns:
        pandas.DataFrame with one row per record and the columns listed above.
        An empty file yields an empty frame that still has those columns.

    Raises:
        ValueError: If a non-blank line has fewer fields than expected.
    """
    if is_train:
        columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
    else:
        columns = ['ID', 'TITLE', 'DESCRIPTION']

    records = []
    with open(file_path, 'r', encoding=encoding) as file:
        # Iterate the handle directly so the whole file is never held as a list.
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            # Limit the number of splits so a separator that happens to occur
            # inside the description stays part of the description.
            parts = line.split(' ::: ', len(columns) - 1)
            if len(parts) != len(columns):
                raise ValueError(
                    f"{file_path}, line {line_number}: expected {len(columns)} "
                    f"fields separated by ' ::: ', found {len(parts)}"
                )
            records.append(dict(zip(columns, parts)))
    return pd.DataFrame(records, columns=columns)

# Specify the file paths
dataset_dir = os.path.join('dataset', 'Genre Classification Dataset')
train_data_path = os.path.join(dataset_dir, 'train_data.txt')
test_data_path = os.path.join(dataset_dir, 'test_data.txt')
test_solutions_path = os.path.join(dataset_dir, 'test_data_solution.txt')

# One encoding for all three files. If UTF-8 decoding fails, try 'latin-1' here.
data_encoding = 'utf-8'

train_data = load_data(train_data_path, is_train=True, encoding=data_encoding)
test_data = load_data(test_data_path, is_train=False, encoding=data_encoding)

# NOTE(review): the solutions file is assumed to use the same four-field layout
# as the training file (ID ::: TITLE ::: GENRE ::: DESCRIPTION) — confirm against
# the dataset's description file. Under that layout, reading it with
# pd.read_csv(..., names=['ID', 'GENRE']) would push the first two fields into
# the index and label the genre column 'ID' and the description column 'GENRE',
# so the same parser as the training data is used instead.
test_solutions = load_data(test_solutions_path, is_train=True, encoding=data_encoding)

# Predictions are later compared with these labels row by row, so the two
# frames must describe the same number of test records.
if len(test_solutions) != len(test_data):
    raise ValueError(
        f"test_data has {len(test_data)} rows but test_solutions has "
        f"{len(test_solutions)}; the files do not describe the same records"
    )



Cell 3: Preprocess the textual data

In [5]:
# Preprocess the textual data
# Both lookups are the same for every document, so they are built once instead
# of once per call (preprocess_text is applied to every description).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS_BY_LANGUAGE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it at most once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def preprocess_text(text):
    """Normalise one description for vectorization.

    The text is lower-cased, ASCII punctuation (``string.punctuation``) is
    removed, the result is tokenized with ``word_tokenize`` and English stop
    words are dropped.

    Args:
        text: The raw description string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower().translate(_PUNCTUATION_TABLE)
    stop_words = _get_stop_words()
    return ' '.join(token for token in word_tokenize(text) if token not in stop_words)

# Clean the description column of both splits with the same routine; the
# cleaned text replaces the raw text in place.
for split_frame in (train_data, test_data):
    split_frame['DESCRIPTION'] = split_frame['DESCRIPTION'].apply(preprocess_text)


Cell 4: Vectorize text data

In [7]:
# Vectorize text data
def vectorize_text(train_data, test_data, text_list=None):
    """Turn descriptions into TF-IDF features without leaking test data.

    The vectorizer learns its vocabulary and IDF weights from the training
    descriptions only. The test descriptions and the optional extra texts are
    then projected into that same feature space. Fitting on the test split as
    well would let test-set statistics influence the features the models are
    trained on, and fitting on extra copies of the training text would distort
    the document frequencies.

    Args:
        train_data: DataFrame with a 'DESCRIPTION' column (training split).
        test_data: DataFrame with a 'DESCRIPTION' column (test split).
        text_list: Optional iterable of additional texts to transform.

    Returns:
        Tuple ``(X_train_tfidf, X_test_tfidf, X_text_list_tfidf, tfidf_vectorizer)``.
        The three matrices share the same columns; the vectorizer is fitted.
    """
    tfidf_vectorizer = TfidfVectorizer()

    # Fit on the training split only; everything else is transform-only.
    X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['DESCRIPTION'])

    def _transform(texts):
        """Project texts into the fitted space; no texts gives a 0-row matrix."""
        texts = [] if texts is None else list(texts)
        if not texts:
            # Slice the training matrix so the column count still matches.
            return X_train_tfidf[:0]
        return tfidf_vectorizer.transform(texts)

    X_test_tfidf = _transform(test_data['DESCRIPTION'])
    X_text_list_tfidf = _transform(text_list)

    return X_train_tfidf, X_test_tfidf, X_text_list_tfidf, tfidf_vectorizer

# The extra text list is simply the (already preprocessed) training descriptions.
text_list = list(train_data['DESCRIPTION'])
vectorized_outputs = vectorize_text(train_data, test_data, text_list)
X_train_tfidf, X_test_tfidf, X_text_list_tfidf, tfidf_vectorizer = vectorized_outputs


Cell 5: Split data for training and testing

In [8]:
# Split data for training and testing
# Test labels come from the solutions frame, training labels from the training frame.
y_test = test_solutions['GENRE']
y_train = train_data['GENRE']
# Distinct genre labels found in the training data
genres = y_train.unique()


Cell 6: Model training

In [16]:
# Model training
def train_models(X_train_tfidf, y_train):
    """Fit the three baseline classifiers on the same training features.

    Args:
        X_train_tfidf: Training feature matrix.
        y_train: Training labels aligned with the rows of ``X_train_tfidf``.

    Returns:
        Tuple ``(nb_model, lr_model, svm_model)``: a fitted MultinomialNB, a
        LogisticRegression with ``max_iter=1000`` and an SVC with default settings.

    NOTE(review): the saved output of the cell that calls this function is a
    KeyboardInterrupt, which suggests fitting was too slow on the full data —
    presumably the SVC step; confirm before relying on it.
    """
    baseline_estimators = (
        MultinomialNB(),
        LogisticRegression(max_iter=1000),
        SVC(),
    )
    for estimator in baseline_estimators:
        estimator.fit(X_train_tfidf, y_train)
    return baseline_estimators

# Fit the baseline classifiers, then bind each one to its own name.
baseline_models = train_models(X_train_tfidf, y_train)
nb_model, lr_model, svm_model = baseline_models


KeyboardInterrupt: 

Cell 7: Model evaluation

In [None]:
# Model evaluation
def evaluate_models(model, X_test_tfidf, y_test, model_name):
    """Print test metrics for a fitted classifier and plot its confusion matrix.

    Prints the accuracy and the classification report, draws the confusion
    matrix as a heatmap, saves it to ``images/confusion_matrix_<model_name>.png``
    and then displays it.

    Args:
        model: Fitted classifier exposing ``predict`` and ``classes_``.
        X_test_tfidf: Test feature matrix.
        y_test: True labels aligned with the rows of ``X_test_tfidf``.
        model_name: Label used in the plot title and the image file name.

    Returns:
        None.
    """
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Fix the row/column order to model.classes_ so the tick labels used below
    # are guaranteed to describe the cells they sit next to.
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=model.classes_, yticklabels=model.classes_, ax=ax)
    ax.set_xlabel('Predicted Genre')
    ax.set_ylabel('Actual Genre')
    ax.set_title(f'Confusion Matrix for {model_name}')

    # Save the confusion matrix as an image BEFORE showing it: once plt.show()
    # has displayed the figure, a later savefig can write an empty canvas.
    os.makedirs('images', exist_ok=True)
    fig.savefig(f'images/confusion_matrix_{model_name}.png', bbox_inches='tight')
    plt.show()
    # Release the figure so repeated evaluations do not accumulate open figures.
    plt.close(fig)

# Evaluate each baseline classifier on the test split, one after another.
baseline_evaluations = (
    ("Evaluating Naive Bayes Model:", nb_model, 'Naive Bayes'),
    ("Evaluating Logistic Regression Model:", lr_model, 'Logistic Regression'),
    ("Evaluating Support Vector Machine Model:", svm_model, 'SVM'),
)
for banner, fitted_model, plot_label in baseline_evaluations:
    print(banner)
    evaluate_models(fitted_model, X_test_tfidf, y_test, plot_label)


Cell 8: Hyperparameter tuning

In [None]:
# Hyperparameter tuning
def tune_hyperparameters(model, param_grid, X_train_tfidf, y_train,
                         cv=5, scoring='accuracy', n_jobs=None):
    """Grid-search hyperparameters for ``model`` and return the best estimator.

    Args:
        model: Estimator to tune.
        param_grid: Parameter grid in any form ``GridSearchCV`` accepts.
        X_train_tfidf: Training feature matrix.
        y_train: Training labels.
        cv: Cross-validation setting passed to ``GridSearchCV`` (default 5).
        scoring: Metric used to rank candidates (default 'accuracy').
        n_jobs: Parallel job count passed to ``GridSearchCV``; the default
            ``None`` leaves the library default in place.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid_search.fit(X_train_tfidf, y_train)
    return grid_search.best_estimator_

nb_param_grid = {'alpha': [0.1, 1.0, 10.0]}
lr_param_grid = {'C': [0.1, 1.0, 10.0], 'max_iter': [100, 500, 1000]}
# The SVM grid is split per kernel: gamma is a kernel coefficient that the
# linear kernel does not use, so crossing it with 'linear' would only repeat
# identical (and expensive) fits. Only the RBF sub-grid varies gamma.
svm_param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1.0, 10.0]},
    {'kernel': ['rbf'], 'C': [0.1, 1.0, 10.0], 'gamma': ['scale', 'auto']},
]

# Each model name is re-bound to its tuned estimator, so later cells use the tuned versions.
print("Tuning Naive Bayes Model:")
nb_model = tune_hyperparameters(nb_model, nb_param_grid, X_train_tfidf, y_train)
print("Tuning Logistic Regression Model:")
lr_model = tune_hyperparameters(lr_model, lr_param_grid, X_train_tfidf, y_train)
print("Tuning Support Vector Machine Model:")
svm_model = tune_hyperparameters(svm_model, svm_param_grid, X_train_tfidf, y_train)


Cell 9: Cross-validation

In [None]:
# Cross-validation
def perform_cross_validation(model, X_train_tfidf, y_train):
    """Run 5-fold cross-validation and print per-fold and mean accuracy.

    Args:
        model: Estimator to evaluate.
        X_train_tfidf: Training feature matrix.
        y_train: Training labels.

    Returns:
        None; the scores are only printed.
    """
    fold_accuracies = cross_val_score(
        model,
        X_train_tfidf,
        y_train,
        scoring='accuracy',
        cv=5,
    )
    print("Cross-Validation Scores:", fold_accuracies)
    mean_accuracy = np.mean(fold_accuracies)
    print("Mean CV Accuracy:", mean_accuracy)

# Cross-validate each classifier on the training features, in a fixed order.
cross_validation_runs = (
    ("Cross-Validation for Naive Bayes Model:", nb_model),
    ("Cross-Validation for Logistic Regression Model:", lr_model),
    ("Cross-Validation for Support Vector Machine Model:", svm_model),
)
for banner, candidate_model in cross_validation_runs:
    print(banner)
    perform_cross_validation(candidate_model, X_train_tfidf, y_train)


Cell 10: Train ensemble models

In [None]:
# Train ensemble models
def train_ensemble_models(X_train_tfidf, y_train):
    """Fit a random forest and a gradient-boosting classifier.

    Both ensembles use 100 estimators and ``random_state=42``.

    Args:
        X_train_tfidf: Training feature matrix.
        y_train: Training labels.

    Returns:
        Tuple ``(rf_model, gb_model)`` of fitted estimators.
    """
    shared_settings = {'n_estimators': 100, 'random_state': 42}
    rf_model = RandomForestClassifier(**shared_settings)
    gb_model = GradientBoostingClassifier(**shared_settings)
    for ensemble in (rf_model, gb_model):
        ensemble.fit(X_train_tfidf, y_train)
    return rf_model, gb_model

# Fit both ensemble classifiers, then bind each one to its own name.
ensemble_models = train_ensemble_models(X_train_tfidf, y_train)
rf_model, gb_model = ensemble_models


Cell 11: Evaluate ensemble models

In [None]:
# Evaluate both ensemble classifiers on the test split.
ensemble_evaluations = (
    ("Evaluating Random Forest Model:", rf_model, 'Random Forest'),
    ("Evaluating Gradient Boosting Model:", gb_model, 'Gradient Boosting'),
)
for banner, fitted_model, plot_label in ensemble_evaluations:
    print(banner)
    evaluate_models(fitted_model, X_test_tfidf, y_test, plot_label)


Cell 12: Visualize test predictions

In [None]:
# Visualize test predictions
def visualize_test_predictions(model, X_test_tfidf, tfidf_vectorizer, genres,
                               y_true=None, n_samples=10, random_state=42):
    """Print and save a random sample of test predictions.

    For each sampled test row, the terms recovered from its TF-IDF vector are
    printed together with the predicted and the actual genre. The same sample
    is rendered as a table and saved to
    ``images/sample_predictions_<ModelClassName>.png``.

    Args:
        model: Fitted classifier exposing ``predict``.
        X_test_tfidf: Test feature matrix produced by ``tfidf_vectorizer``.
        tfidf_vectorizer: Fitted vectorizer; its ``inverse_transform`` recovers
            the terms present in each row (not the original word order).
        genres: Unused; kept so existing calls keep working.
        y_true: True labels aligned with the rows of ``X_test_tfidf``. When
            omitted, the notebook-level ``y_test`` is used, as before.
        n_samples: Maximum number of rows to show (capped at the row count).
        random_state: Seed for the sample, so reruns and different models show
            the same rows. Pass ``None`` for a different sample on every call.

    Returns:
        None.

    Raises:
        ValueError: If the number of labels differs from the number of predictions.
    """
    if y_true is None:
        # Backward compatibility: earlier calls relied on the global y_test.
        y_true = y_test

    y_pred = model.predict(X_test_tfidf)
    actual = np.asarray(y_true)
    if len(actual) != len(y_pred):
        raise ValueError(
            f"got {len(y_pred)} predictions but {len(actual)} true labels"
        )

    labels_df = pd.DataFrame({'Predicted Genre': y_pred, 'Actual Genre': actual})
    if labels_df.empty:
        print("No test rows to visualize.")
        return

    # Sample first, then recover terms for the sampled rows only; running
    # inverse_transform on the whole test matrix would be wasted work.
    sample_predictions = labels_df.sample(
        n=min(n_samples, len(labels_df)), random_state=random_state
    )
    row_positions = sample_predictions.index.to_numpy()
    sampled_terms = tfidf_vectorizer.inverse_transform(X_test_tfidf[row_positions])
    sample_predictions.insert(0, 'Description', [' '.join(terms) for terms in sampled_terms])

    for _, row in sample_predictions.iterrows():
        print(f"Description: {row['Description']}")
        print(f"Predicted Genre: {row['Predicted Genre']}")
        print(f"Actual Genre: {row['Actual Genre']}")
        print("\n")

    # Shorten long descriptions for the image only, so the table stays legible;
    # the printed output above keeps the full text.
    max_chars = 60
    table_frame = sample_predictions.copy()
    table_frame['Description'] = [
        text if len(text) <= max_chars else text[:max_chars - 3] + '...'
        for text in table_frame['Description']
    ]

    # Save the sample predictions as an image
    fig, ax = plt.subplots()
    ax.axis('tight')
    ax.axis('off')
    ax.table(cellText=table_frame.values, colLabels=table_frame.columns,
             cellLoc='center', loc='center')
    os.makedirs('images', exist_ok=True)
    fig.savefig(f'images/sample_predictions_{model.__class__.__name__}.png', bbox_inches='tight')
    plt.show()
    # Close the figure so one figure per model does not stay open.
    plt.close(fig)

# Show sample predictions for every trained classifier, in training order.
visualization_runs = (
    ("Visualizing Test Predictions for Naive Bayes Model:", nb_model),
    ("Visualizing Test Predictions for Logistic Regression Model:", lr_model),
    ("Visualizing Test Predictions for Support Vector Machine Model:", svm_model),
    ("Visualizing Test Predictions for Random Forest Model:", rf_model),
    ("Visualizing Test Predictions for Gradient Boosting Model:", gb_model),
)
for banner, fitted_model in visualization_runs:
    print(banner)
    visualize_test_predictions(fitted_model, X_test_tfidf, tfidf_vectorizer, genres)


Cell 13: Save models and vectorizer

In [None]:
# Save models and vectorizer
def save_model(model, filename):
    """Pickle ``model`` to ``filename``, overwriting any existing file.

    Args:
        model: Any picklable object (an estimator, a vectorizer, ...).
        filename: Destination path; opened in binary write mode.

    Returns:
        None.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(model)

# Persist every fitted model and the vectorizer next to the notebook.
artifacts_to_save = (
    (nb_model, 'nb_model.pkl'),
    (lr_model, 'lr_model.pkl'),
    (svm_model, 'svm_model.pkl'),
    (rf_model, 'rf_model.pkl'),
    (gb_model, 'gb_model.pkl'),
    (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
)
for fitted_object, target_file in artifacts_to_save:
    save_model(fitted_object, target_file)
