<h3>Cleaning and Preprocessing</h3>

In [47]:
#import
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import regex as re
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [62]:
# Define the column names
column_names = ['tweet_id', 'entity', 'sentiment', 'content']

dataset_path = 'twitter_training.csv' 
eval_data_path = 'twitter_evaluation.csv'
tweets_df = pd.read_csv(dataset_path, header=None, names=column_names)
eval_df = pd.read_csv(dataset_path, header=None, names=column_names)

df.head()

Unnamed: 0,tweet_id,entity,sentiment,content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [63]:
print("Before: ")

print("null values: ", tweets_df.isnull().sum())
print("duplicates: ", tweets_df.duplicated().sum())

tweets_df.dropna(inplace=True)
tweets_df.drop_duplicates(inplace=True)
tweets_df.drop('tweet_id', axis=1, inplace=True)

#for evaluation data
eval_df.dropna(inplace=True)
eval_df.drop_duplicates(inplace=True)
eval_df.drop('tweet_id', axis=1, inplace=True)

Before: 
null values:  tweet_id       0
entity         0
sentiment      0
content      686
dtype: int64
duplicates:  2700


In [64]:
# Remove Non-String
def filter_non_string(df, column):
    df = df.dropna(subset=[column])
    df[column] = df[column].astype(str)
    return df

# Convert In LowerCase
def normalize_text(text):
    """Convert text to lowercase to ensure consistency across the corpus."""
    return text.lower()

# Remove HTML Tags
def remove_html_tags(text):
    """Remove HTML tags from text."""
    return re.sub(r'<.*?>', '', text)

# Remove URL Or HyperLink
def remove_urls(text):
    """Remove URLs or hyperlinks from the text."""
    return re.sub(r'http\S+|www\S+', '', text)

# Remove Punctuation
def remove_punctuation(text):
    """Remove punctuation marks from the text."""
    return text.translate(str.maketrans('', '', string.punctuation))

# Split Text In Token
def tokenize_text(text):
    """Split the text into individual words or tokens."""
    return word_tokenize(text)

# Eliminate Stopwords
def remove_stopwords(tokens):
    """Eliminate common stopwords from the tokenized text."""
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

# Remove Emojis
def remove_emojis(text):
    """Remove emojis from the text."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Stem Tokens
def stem_tokens(tokens):
    """Apply stemming to the tokenized text."""
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokens]

# Lemmatize Tokens
def lemmatize_tokens(tokens):
    """Apply lemmatization to the tokenized text."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

def preprocess_text(df):
    df = filter_non_string(df, 'content')
    df['cleaned_text'] = df['content'].apply(normalize_text)
    df['cleaned_text'] = df['cleaned_text'].apply(remove_html_tags)
    df['cleaned_text'] = df['cleaned_text'].apply(remove_urls)
    df['cleaned_text'] = df['cleaned_text'].apply(remove_punctuation)
    df['cleaned_text'] = df['cleaned_text'].apply(remove_emojis)
    df['cleaned_text'] = df['cleaned_text'].apply(tokenize_text)
    df['cleaned_text'] = df['cleaned_text'].apply(remove_stopwords)
    #df['cleaned_text'] = df['cleaned_text'].apply(stem_tokens)
    df['cleaned_text'] = df['cleaned_text'].apply(lemmatize_tokens)
    # Optionally, join the tokens back into a string if needed
    df['cleaned_text'] = df['cleaned_text'].apply(lambda tokens: ' '.join(tokens))
    return df

# Usage:
data_processed = preprocess_text(tweets_df)
eval_processed = preprocess_text(eval_df)
print(data_processed.head())

        entity sentiment                                            content  \
0  Borderlands  Positive  im getting on borderlands and i will murder yo...   
1  Borderlands  Positive  I am coming to the borders and I will kill you...   
2  Borderlands  Positive  im getting on borderlands and i will kill you ...   
3  Borderlands  Positive  im coming on borderlands and i will murder you...   
4  Borderlands  Positive  im getting on borderlands 2 and i will murder ...   

                     cleaned_text  
0    im getting borderland murder  
1              coming border kill  
2      im getting borderland kill  
3     im coming borderland murder  
4  im getting borderland 2 murder  


In [54]:
data_processed

Unnamed: 0,entity,sentiment,content,cleaned_text
0,Borderlands,Positive,im getting on borderlands and i will murder yo...,im getting borderland murder
1,Borderlands,Positive,I am coming to the borders and I will kill you...,coming border kill
2,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting borderland kill
3,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderland murder
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderland 2 murder
...,...,...,...,...
74677,Nvidia,Positive,Just realized that the Windows partition of my...,realized window partition mac like 6 year behi...
74678,Nvidia,Positive,Just realized that my Mac window partition is ...,realized mac window partition 6 year behind nv...
74679,Nvidia,Positive,Just realized the windows partition of my Mac ...,realized window partition mac 6 year behind nv...
74680,Nvidia,Positive,Just realized between the windows partition of...,realized window partition mac like 6 year behi...


In [65]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
le = LabelEncoder()

# Encode the 'Entity' feature
data_processed['entity'] = le.fit_transform(data_processed['entity'])
eval_processed['entity'] = le.fit_transform(eval_processed['entity'])

# Encode the 'Sentiment' target variable
data_processed['sentiment'] = le.fit_transform(data_processed['sentiment'])
eval_processed['sentiment'] = le.fit_transform(eval_processed['sentiment'])

In [66]:
data_processed.head()

Unnamed: 0,entity,sentiment,content,cleaned_text
0,4,3,im getting on borderlands and i will murder yo...,im getting borderland murder
1,4,3,I am coming to the borders and I will kill you...,coming border kill
2,4,3,im getting on borderlands and i will kill you ...,im getting borderland kill
3,4,3,im coming on borderlands and i will murder you...,im coming borderland murder
4,4,3,im getting on borderlands 2 and i will murder ...,im getting borderland 2 murder


In [70]:
#saving the preprocessed data
data_processed.to_csv('preprocessed_tweets.csv', index=False)
eval_processed.to_csv('processed_evaluation_tweets.csv', index=False)

<h3>Encoding</h3>

In [68]:
#importing the libraries for encoding
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np

In [71]:
#importing the preprocessed data
preprocessed_df = pd.read_csv('preprocessed_tweets.csv')
eval_processed = pd.read_csv('processed_evaluation_tweets.csv')

preprocessed_df.head()

Unnamed: 0,entity,sentiment,content,cleaned_text
0,4,3,im getting on borderlands and i will murder yo...,im getting borderland murder
1,4,3,I am coming to the borders and I will kill you...,coming border kill
2,4,3,im getting on borderlands and i will kill you ...,im getting borderland kill
3,4,3,im coming on borderlands and i will murder you...,im coming borderland murder
4,4,3,im getting on borderlands 2 and i will murder ...,im getting borderland 2 murder


In [61]:
#Bag of words
def encode_bow(corpus):
    vectorizer = CountVectorizer()
    bow_vectors = vectorizer.fit_transform(corpus)
    return bow_vectors, vectorizer
#TF-IDF
def encode_tfidf(corpus):
    vectorizer = TfidfVectorizer()
    tfidf_vectors = vectorizer.fit_transform(corpus)
    return tfidf_vectors, vectorizer
# Word2Vec (CBOW and Skip-Gram)
def train_word2vec(corpus, vector_size=100, window=5, min_count=1, sg=0):
    tokenized_corpus = [doc.split() for doc in corpus]
    model = Word2Vec(sentences=tokenized_corpus, vector_size=vector_size, window=window, min_count=min_count, sg=sg)
    return model

def encode_word2vec(model, tokenized_corpus):
    vectors = []
    for tokens in tokenized_corpus:
        vector = sum([model.wv[token] for token in tokens if token in model.wv], start=np.zeros(model.vector_size))
        vectors.append(vector)
    return np.array(vectors)

preprocessed_df['cleaned_text'].fillna('', inplace=True)

# Encode using Bag of Words
bow_vectors, bow_vectorizer = encode_bow(data_processed['cleaned_text'])

# Encode using TF-IDF
tfidf_vectors, tfidf_vectorizer = encode_tfidf(data_processed['cleaned_text'])

# Train Word2Vec models (CBOW and Skip-Gram)
cbow_model = train_word2vec(data_processed['cleaned_text'], sg=0)
skipgram_model = train_word2vec(data_processed['cleaned_text'], sg=1)

# Tokenized corpus for Word2Vec encoding
tokenized_corpus = [doc.split() for doc in data_processed['cleaned_text']]

# Encode using Word2Vec (CBOW)
cbow_vectors = encode_word2vec(cbow_model, tokenized_corpus)

# Encode using Word2Vec (Skip-Gram)
skipgram_vectors = encode_word2vec(skipgram_model, tokenized_corpus)

# Display the shapes of the encoded vectors to confirm
print("BoW Vectors Shape:", bow_vectors.shape)
print("TF-IDF Vectors Shape:", tfidf_vectors.shape)
print("Word2Vec CBOW Vectors Shape:", cbow_vectors.shape)
print("Word2Vec Skip-Gram Vectors Shape:", skipgram_vectors.shape)

BoW Vectors Shape: (71656, 36479)
TF-IDF Vectors Shape: (71656, 36479)
Word2Vec CBOW Vectors Shape: (71656, 100)
Word2Vec Skip-Gram Vectors Shape: (71656, 100)


<h3>Model Training</h3>

In [77]:
#importing libraries
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [76]:
# Split the data into features (X) and target (y)
X = data_processed['content']
y = data_processed['sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:


# Encode using Bag of Words
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)


# Define model training functions
def train_naive_bayes(X_train, y_train):
    nb = GaussianNB()
    nb.fit(X_train.toarray(), y_train)
    return nb

def train_svm(X_train, y_train, kernel='rbf', C=1.0):
    svm = SVC(kernel=kernel, C=C)
    svm.fit(X_train, y_train)
    return svm

def train_logistic_regression(X_train, y_train):
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    return lr

def train_adaboost(X_train, y_train, n_estimators=50):
    ab = AdaBoostClassifier(n_estimators=n_estimators)
    ab.fit(X_train, y_train)
    return ab

# Train models using Bag of Words
nb_model_bow = train_naive_bayes(X_train_bow, y_train)
svm_model_bow = train_svm(X_train_bow, y_train)
lr_model_bow = train_logistic_regression(X_train_bow, y_train)
ab_model_bow = train_adaboost(X_train_bow, y_train)


# Define evaluation function
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return accuracy, report

# Evaluate models using Bag of Words
print("Evaluating models using Bag of Words...\n")
nb_accuracy_bow, nb_report_bow = evaluate_model(nb_model_bow, X_test_bow, y_test)
print(f"Naive Bayes Accuracy (BoW): {nb_accuracy_bow}")
print(f"Naive Bayes Classification Report (BoW):\n{nb_report_bow}")

svm_accuracy_bow, svm_report_bow = evaluate_model(svm_model_bow, X_test_bow, y_test)
print(f"SVM Accuracy (BoW): {svm_accuracy_bow}")
print(f"SVM Classification Report (BoW):\n{svm_report_bow}")

lr_accuracy_bow, lr_report_bow = evaluate_model(lr_model_bow, X_test_bow, y_test)
print(f"Logistic Regression Accuracy (BoW): {lr_accuracy_bow}")
print(f"Logistic Regression Classification Report (BoW):\n{lr_report_bow}")

ab_accuracy_bow, ab_report_bow = evaluate_model(ab_model_bow, X_test_bow, y_test)
print(f"AdaBoost Accuracy (BoW): {ab_accuracy_bow}")
print(f"AdaBoost Classification Report (BoW):\n{ab_report_bow}")


MemoryError: Unable to allocate 12.7 GiB for an array with shape (57324, 29718) and data type float64

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Assuming 'data_processed' is your DataFrame and 'Sentiment' is the target variable

# Join the words in the 'Tweet content' column
#data_processed['content'] = data_processed['content'].apply(' '.join)

# Split the data into features (X) and target (y)
X = data_processed['content']
y = data_processed['sentiment']

# Create a pipeline with TF-IDF Vectorizer and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Generate the classification report
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7795841473625453
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.67      0.73      2529
           1       0.78      0.86      0.82      4383
           2       0.78      0.75      0.76      3543
           3       0.76      0.79      0.78      3877

    accuracy                           0.78     14332
   macro avg       0.78      0.77      0.77     14332
weighted avg       0.78      0.78      0.78     14332



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Define a function to train and evaluate a pipeline
def train_and_evaluate(X, y, model):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create a pipeline with TF-IDF Vectorizer and the specified model
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', model)
    ])

    # Fit the model on the training data
    pipeline.fit(X_train, y_train)

    # Make predictions on the testing set
    y_pred = pipeline.predict(X_test)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)

    # Generate the classification report
    report = classification_report(y_test, y_pred)

    print(f"Model: {model.__class__.__name__}")
    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n{report}")
    print('\n')

# Specify the models
models = [
    LogisticRegression(),
    SVC(),
    MultinomialNB(),
    DecisionTreeClassifier(),
    RandomForestClassifier()
]

# Train and evaluate each model
for model in models:
    train_and_evaluate(X, y, model)

    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression
Accuracy: 0.7795841473625453
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.67      0.73      2529
           1       0.78      0.86      0.82      4383
           2       0.78      0.75      0.76      3543
           3       0.76      0.79      0.78      3877

    accuracy                           0.78     14332
   macro avg       0.78      0.77      0.77     14332
weighted avg       0.78      0.78      0.78     14332



