In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score,precision_score, recall_score, f1_score, confusion_matrix
import gensim
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Read the labelled tweets, keeping only the two columns used below:
# the raw tweet text and its class label.
tweets = pd.read_csv(
    "Multi_Labeled_Data.csv",
    usecols=['Tweet_Text', 'Label'],
)

# Preprocess the tweets (optional)
def preprocess_text(text):
    """Normalise a single tweet by lowercasing it.

    Args:
        text: The raw tweet. Normally a ``str``; ``None`` and float NaN
            (how pandas represents an empty CSV cell) are accepted too.

    Returns:
        str: The lowercased text, or ``""`` when the value is missing.
        Any other non-string value is converted with ``str()`` first.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return str(text).lower()

# Lowercase every tweet; `y` stays the label column exactly as read from the CSV.
preprocessed_tweets = [preprocess_text(tweet) for tweet in tweets['Tweet_Text']]
y = tweets['Label']

# KFold yields positional row numbers. Indexing the pandas Series with them is
# a label lookup that only works while the index is the default 0..n-1 range,
# so the folds are sliced from a plain NumPy array of the labels instead.
y_array = np.asarray(y)

# Vectorize the text data using TF-IDF
# NOTE(review): the vectorizer is fitted on every tweet before the folds are
# made, so the vocabulary and IDF weights also reflect the held-out tweets.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(preprocessed_tweets)

# 5-fold cross-validation with a fixed shuffle so the folds are repeatable
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the Naive Bayes classifier
clf = MultinomialNB()

# Per-fold metrics (precision/recall/F1 are weighted averages over the classes)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
for train_index, test_index in kf.split(X_tfidf):
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y_array[train_index], y_array[test_index]

    # Fit on the training folds and predict the held-out fold
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Evaluate the model on the held-out fold
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
    recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Report each metric averaged over the folds
avg_accuracy = np.mean(accuracy_scores)
print("Average accuracy: ", avg_accuracy)
avg_precision = np.mean(precision_scores)
print("Average precision: ", avg_precision)
avg_recall = np.mean(recall_scores)
print("Average recall: ", avg_recall)
avg_f1 = np.mean(f1_scores)
print("Average f1-measure: ", avg_f1)


In [None]:
# Define the classifiers compared on the TF-IDF features
logreg = LogisticRegression(max_iter=1000)
svm = SVC(kernel='linear')
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
# Seeded like the other tree-based models so repeated runs give the same tree.
dt = DecisionTreeClassifier(random_state=42)

classifiers = [logreg, svm, rf, gb, dt]

# KFold yields positional indices, so slice the labels from a NumPy array
# rather than doing a label-based lookup on a pandas Series.
y_array = np.asarray(y)

for clf in classifiers:
    # Per-fold metrics for the current classifier
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    for train_index, test_index in kf.split(X_tfidf):
        X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
        y_train, y_test = y_array[train_index], y_array[test_index]

        # Fit the current classifier and predict the held-out fold
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Evaluate the model (weighted averages over the classes)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
        recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
        print(f'{clf.__class__.__name__} Classification Report: ')
        print(classification_report(y_test, y_pred))

    # Average each metric over the folds and print one summary line per model
    avg_accuracy = np.mean(accuracy_scores)
    avg_precision = np.mean(precision_scores)
    avg_recall = np.mean(recall_scores)
    avg_f1 = np.mean(f1_scores)
    print(f'{clf.__class__.__name__} accuracy: {avg_accuracy:.4f}, Precision: {avg_precision:.4f},  Recall: {avg_recall:.4f},  f1-score: {avg_f1:.4f}')




In [None]:
#using Word2Vec Technique
# Define the classifiers
logreg = LogisticRegression(max_iter=1000)
svm = SVC(kernel='linear')
# Not part of `classifiers` below; kept defined for any later cell that uses `nb`.
nb = MultinomialNB()
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
# Seeded like the other tree-based models so repeated runs give the same tree.
dt = DecisionTreeClassifier(random_state=42)

classifiers = [gb, dt, logreg, svm, rf]

# Train Word2Vec on the whitespace-tokenised tweets
W2V_DIM = 200
sentences = [tweet.split() for tweet in preprocessed_tweets]
model = gensim.models.Word2Vec(sentences, min_count=1, vector_size=W2V_DIM)

# Represent each tweet by the mean of its word vectors
X = []
for tweet in sentences:
    vec = np.zeros(W2V_DIM)
    count = 0
    for word in tweet:
        # Skip out-of-vocabulary words explicitly instead of swallowing every error
        if word in model.wv.key_to_index:
            vec += model.wv[word]
            count += 1
    # A tweet with no known tokens keeps the zero vector; dividing by a zero
    # count would turn the whole row into NaN and break the classifiers.
    if count:
        vec /= count
    X.append(vec)

# Build the arrays once instead of re-creating them inside every fold
X = np.array(X)
y_array = np.array(y)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

for clf in classifiers:
    # Per-fold metrics for the current classifier
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_array[train_index], y_array[test_index]

        # Fit the current classifier and predict the held-out fold
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Evaluate the model (weighted averages over the classes)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
        recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
        print(f'{clf.__class__.__name__} Classification Report: ')
        print(classification_report(y_test, y_pred))

    # Average each metric over the folds and print one summary line per model
    avg_accuracy = np.mean(accuracy_scores)
    avg_precision = np.mean(precision_scores)
    avg_recall = np.mean(recall_scores)
    avg_f1 = np.mean(f1_scores)
    print(f'{clf.__class__.__name__} accuracy: {avg_accuracy:.4f}, Precision: {avg_precision:.4f},  Recall: {avg_recall:.4f},  f1-score: {avg_f1:.4f}')




In [None]:
# Create the classifier in this cell so it does not depend on a variable
# left over from an earlier cell.
nb = MultinomialNB()
classifiers = [nb]

# Train Word2Vec on the whitespace-tokenised tweets
W2V_DIM = 200
sentences = [tweet.split() for tweet in preprocessed_tweets]
model = gensim.models.Word2Vec(sentences, min_count=1, vector_size=W2V_DIM)

# Represent each tweet by the mean of its word vectors
X = []
for tweet in sentences:
    vec = np.zeros(W2V_DIM)
    count = 0
    for word in tweet:
        # Skip out-of-vocabulary words explicitly instead of swallowing every error
        if word in model.wv.key_to_index:
            vec += model.wv[word]
            count += 1
    # A tweet with no known tokens keeps the zero vector; dividing by a zero
    # count would create NaN, which would also make the minimum below NaN
    # and poison the entire shifted matrix.
    if count:
        vec /= count
    X.append(vec)

# Shift all features by the global minimum so every value is >= 0
# (MultinomialNB expects non-negative feature values).
X = np.array(X)
X = X - X.min()
y_array = np.array(y)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

for clf in classifiers:
    # Per-fold metrics for the current classifier
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_array[train_index], y_array[test_index]

        # Train the Naive Bayes classifier and predict the held-out fold
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Evaluate the model (weighted averages over the classes)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
        recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
        print(f'{clf.__class__.__name__} Classification Report: ')
        print(classification_report(y_test, y_pred))

    # Average each metric over the folds and print one summary line per model
    avg_accuracy = np.mean(accuracy_scores)
    avg_precision = np.mean(precision_scores)
    avg_recall = np.mean(recall_scores)
    avg_f1 = np.mean(f1_scores)
    print(f'{clf.__class__.__name__} accuracy: {avg_accuracy:.4f}, Precision: {avg_precision:.4f},  Recall: {avg_recall:.4f},  f1-score: {avg_f1:.4f}')



In [None]:
from gensim.models import KeyedVectors
# cross_val_predict is used below but was never imported anywhere in the notebook
from sklearn.model_selection import cross_val_predict
from tensorflow.keras.preprocessing.sequence import pad_sequences


#using FastText Word Embedding
# Load the pre-trained FastText embeddings
embedding_path = 'cc.hi.300.vec.gz'
embedding_model = KeyedVectors.load_word2vec_format(embedding_path, binary=False)
# Maximum number of tokens kept per tweet; shorter tweets are zero-padded
max_length=200


# Define the classifiers
logreg = LogisticRegression(max_iter=1000)
nb = MultinomialNB()  # defined but not included in `classifiers` below
svm = SVC(kernel='linear')
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
dt = DecisionTreeClassifier(random_state=42)


# Generate embeddings for the input texts
embedding_size = embedding_model.vector_size
input_embeddings = []

for text in preprocessed_tweets:
    text_embeddings = []
    # Split into words first: iterating the string itself walks over single
    # characters, so the lookup below would be done per character.
    for word in text.split():
        if word in embedding_model.key_to_index:
            text_embeddings.append(embedding_model[word])
        else:
            # Out-of-vocabulary words are represented by a zero vector
            text_embeddings.append(np.zeros(embedding_size))
    input_embeddings.append(text_embeddings)

# Pad/truncate every tweet to max_length word vectors (scalar 0.0 padding),
# then flatten each tweet to one row of max_length * embedding_size features.
padded_embeddings = pad_sequences(input_embeddings, maxlen=max_length, dtype='float32', padding='post', truncating='post', value=0.0)
X = padded_embeddings.reshape(padded_embeddings.shape[0], -1)
y = np.array(tweets['Label'])

classifiers = [logreg, svm, rf, dt, gb]
for clf in classifiers:
    # Out-of-fold predictions for every tweet, scored once over the whole set
    y_pred = cross_val_predict(clf, X, y, cv=5)
    acc = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average='weighted')
    prec = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    print(f'{clf.__class__.__name__} accuracy: {acc:.4f}, precision: {prec:.4f}, recall: {recall:.4f},  f1-score: {f1:.4f}')

