<a href="https://colab.research.google.com/github/Almamun809/Daily-NLP/blob/main/MNB_for_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB

# Patterns are compiled once and shared by the training and test columns.
_URL_RE = re.compile(r'http\S+')
_USERNAME_RE = re.compile(r'\@\w+')
_HASHTAG_RE = re.compile(r'\#\w+')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Return *text* with URLs, @usernames, #hashtags and punctuation removed.

    Runs of whitespace are collapsed to a single space, leading/trailing
    whitespace is stripped, and the result is lower-cased.
    """
    text = _URL_RE.sub('', text)
    text = _USERNAME_RE.sub('', text)
    # Hashtags have to go before punctuation: once '#' has been stripped as
    # punctuation the hashtag pattern can no longer match anything.
    text = _HASHTAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text.strip())
    return text.lower()


def preprocess_text_column(column):
    """Return a cleaned copy of a pandas Series of raw text.

    Missing values are replaced with empty strings *before* any regex runs,
    because ``re.sub`` raises ``TypeError`` on NaN. Remaining non-string
    values are converted with ``str`` so they can be cleaned as well.
    """
    return column.fillna('').astype(str).apply(clean_text)


# Read the training CSV file
#train_data = pd.read_csv('train_file.csv')

# Preprocess the text data for training
# (train_data must already exist in the session; the read above is disabled)
train_data['Data'] = preprocess_text_column(train_data['Data'])

# Read the testing CSV file
#test_data = pd.read_csv('test_file.csv')

# Preprocess the text data for testing with exactly the same steps
# (test_data must already exist in the session; the read above is disabled)
test_data['Data'] = preprocess_text_column(test_data['Data'])

# Targets: the 'Label' column of each split
y_train = train_data['Label']
y_test = test_data['Label']

# TF-IDF over n-grams spanning 1 to 5 tokens. No `analyzer` argument is
# passed, so the vectorizer's default analyzer is used (word-level per the
# scikit-learn documentation -- these are not character n-grams).
vectorizer = TfidfVectorizer(ngram_range=(1, 5))

# Fit the vectorizer on the training text only, then apply the fitted
# transform to the test text
X_train = vectorizer.fit_transform(train_data['Data'])
X_test = vectorizer.transform(test_data['Data'])

# Multinomial Naive Bayes model; the same instance is re-fitted on every fold
classifier = MultinomialNB()

# Shuffled 3-fold split with a fixed seed so the folds are reproducible
kfold = KFold(n_splits=3, shuffle=True, random_state=36)

# One score per fold is appended to each of these lists
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# NOTE(review): X_train was vectorized before this split, so the TF-IDF
# statistics were fitted on rows that later serve as validation data --
# confirm this is acceptable for the reported cross-validation scores.
for fit_rows, holdout_rows in kfold.split(X_train):
    # Train on the fitting part of the fold
    classifier.fit(X_train[fit_rows], y_train.iloc[fit_rows])

    # Score the held-out part of the fold
    y_val_fold = y_train.iloc[holdout_rows]
    y_pred_val = classifier.predict(X_train[holdout_rows])

    accuracy_scores.append(accuracy_score(y_val_fold, y_pred_val))
    precision_scores.append(
        precision_score(y_val_fold, y_pred_val, average='macro', zero_division=1)
    )
    recall_scores.append(recall_score(y_val_fold, y_pred_val, average='macro'))
    f1_scores.append(f1_score(y_val_fold, y_pred_val, average='macro'))

# Mean of each metric across the folds
accuracy_avg, precision_avg, recall_avg, f1_avg = (
    np.mean(fold_scores)
    for fold_scores in (accuracy_scores, precision_scores, recall_scores, f1_scores)
)

# Final model: fit on all training rows, then predict the test rows
classifier.fit(X_train, y_train)
y_pred_test = classifier.predict(X_test)

# Test-set metrics, computed with the same settings as in cross-validation
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test, average='macro', zero_division=1)
recall_test = recall_score(y_test, y_pred_test, average='macro')
f1_test = f1_score(y_test, y_pred_test, average='macro')

# Report the cross-validation averages first, then the test-set scores
print("Cross-validation scores:")
for metric_name, metric_value in (
    ('Accuracy', accuracy_avg),
    ('Precision', precision_avg),
    ('Recall', recall_avg),
    ('F1 Score', f1_avg),
):
    print(f'{metric_name}: {metric_value}')

print("\nTest set scores:")
for metric_name, metric_value in (
    ('Accuracy', accuracy_test),
    ('Precision', precision_test),
    ('Recall', recall_test),
    ('F1 Score', f1_test),
):
    print(f'{metric_name}: {metric_value}')
