### Libraries

In [3]:
import pandas as pd
import re, itertools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import preprocessor as p
from nltk.stem import WordNetLemmatizer
import little_mallet_wrapper
from nltk.tokenize import TweetTokenizer 
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from nltk.classify import SklearnClassifier
# pip install -U imbalanced-learn scikit-learn
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
pd.options.display.max_colwidth = 100

### Dataset

In [4]:
df = pd.read_csv("full-corpus-training.csv")

# Filter out records with the "irrelevant" label
df = df[df['Sentiment'] != 'irrelevant']

df.head()

Unnamed: 0,Sentiment,TweetId,TweetText
0,positive,1.26e+17,Now all @Apple has to do is get swype on the iphone and it will be crack. Iphone that is
1,positive,1.26e+17,@Apple will be adding more carrier support to the iPhone 4S (just announced)
2,positive,1.26e+17,Hilarious @youtube video - guy does a duet with @apple 's Siri. Pretty much sums up the love aff...
3,positive,1.26e+17,@RIM you made it too easy for me to switch to @Apple iPhone. See ya!
4,positive,1.26e+17,I just realized that the reason I got into twitter was ios5 thanks @apple


In [5]:
df.shape

(2994, 3)

In [6]:
df = df.drop(['TweetId'], axis=1)
df.head()

Unnamed: 0,Sentiment,TweetText
0,positive,Now all @Apple has to do is get swype on the iphone and it will be crack. Iphone that is
1,positive,@Apple will be adding more carrier support to the iPhone 4S (just announced)
2,positive,Hilarious @youtube video - guy does a duet with @apple 's Siri. Pretty much sums up the love aff...
3,positive,@RIM you made it too easy for me to switch to @Apple iPhone. See ya!
4,positive,I just realized that the reason I got into twitter was ios5 thanks @apple


### Missing Values

In [7]:
df.isna().sum()

Sentiment    0
TweetText    0
dtype: int64

### Preprocess and cleaning the text to remove any stop words or punctuations

In [8]:
def clean_text_data(text):
    # preprocessor
    text = p.clean(text)

    # Remove HTML tags and URLs
    text = re.sub(r'<[^>]+>|http[s]?://\S+|http\S+|www\S+|https\S+', '', text)
    
    # Remove punctuation and replace words with multiple consecutive letters
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    
    # Insert a space before all capital letters in the middle of a sentence
    text = re.sub(r"(\w)([A-Z])", r"\1 \2", text)

    # Tokenize the tweet using TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    word_tokens = tokenizer.tokenize(text)

    # Stop word removal and length filtering
    stop_words = set(stopwords.words('english'))
    stop_words.remove('not')
    filtered_text = [word for word in word_tokens if word.isalnum() and len(word) > 3 and word.lower() not in stop_words]

    # Lowercase change
    text = ' '.join(filtered_text).lower()

    # Lemmatization using WordNet Lemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    text = ' '.join(lemmatized_words)

    return text

# Apply the cleaning function to the 'TweetText' column
df['CleanedTweet'] = df['TweetText'].apply(lambda x: little_mallet_wrapper.process_string(x, numbers='remove'))
df['CleanedTweet'] = df['CleanedTweet'].apply(clean_text_data)

# Display the cleaned DataFrame
print(df['CleanedTweet'].head(10))


0                                                          apple swype iphone crack iphone
1                                            apple adding carrier support iphone announced
2         hilarious youtube video duet apple siri pretty much sum love affair http exbnqjy
3                                                            made easy switch apple iphone
4                                                     realized reason twitter thanks apple
5                    current blackberry user little disappointed move android apple iphone
6    strangest thing siri said glad apple gave siri sense humor http cotwaeudbp happyplace
7                                    great close personal event apple tonight regent store
8                              company experience best customer service aside zappos apple
9                                                                    apply apple hope call
Name: CleanedTweet, dtype: object


### TfidfVectorizer

In [9]:
# TF IDF vectorizer with adjusted parameters
tfidf_vect = TfidfVectorizer()

# Fit and transform
matrix_tfidf = tfidf_vect.fit_transform(df['CleanedTweet'])

# Using get_feature_names_out
featureNames = tfidf_vect.get_feature_names_out()

# Data frame for our matrix_tfidf and featureNames
df_tfidf = pd.DataFrame(data=matrix_tfidf.toarray(), columns=featureNames)

# Adding up the importance scores (= TF-IDF scores) for every word.
wordScores = df_tfidf.sum(axis=0)

# Sorting words according to how much they matter in all the tweets
# Sorting them with their overall TF-IDF scores.
top20words = wordScores.sort_values(ascending=False).head(20)

# Print top20words
print(top20words)


twitter             154.058752
google              132.880067
http                132.453139
apple               118.209687
microsoft           112.318714
android              90.867818
iphone               51.194151
nexus                48.710277
samsung              45.992413
sandwich             43.026021
cream                42.603218
phone                40.175494
galaxy               36.254216
window               31.573166
like                 29.438681
siri                 28.974785
facebook             28.959853
ballmer              24.940511
icecreamsandwich     24.068868
steve                22.565562
dtype: float64


In [10]:
# Visualization of top words
# plt.figure(figsize=(12, 6))
# top20words.plot(kind='bar')
# plt.title('Top 50 Words and Their TF-IDF Scores')
# plt.xlabel('Words')
# plt.ylabel('TF-IDF Score')
# plt.show()



In [11]:
# vectorizer = TfidfVectorizer(max_features=5000)
# X_train_vectorized = vectorizer.fit_transform(X_train)
# X_test_vectorized = vectorizer.transform(X_test)

### Oversample the training data

In [12]:
# Oversample the training data

ros = RandomOverSampler(random_state=42)
X = df['CleanedTweet'].values.reshape(-1, 1)
y = df['Sentiment'].values
X_resampled, y_resampled = ros.fit_resample(X, y)

# convert text (object) data to string for w2v
X_resampled= [str(obj) for obj in X_resampled]
X_resampled = np.array(X_resampled)

# resource : https://www.kaggle.com/code/titanpointe/cyberbullying-tweets-eda-automl-dl-bert

### Train Word2Vec Model

In [13]:
# Train Word2Vec Model
sentences = [word_tokenize(text) for text in X_resampled]
word2vec_model = Word2Vec(sentences, vector_size=300, window=5, min_count=1, workers=4)  # Adjust parameters as needed

# Convert Text to Embeddings
def get_embedding(text):
    tokens = word_tokenize(text)
    # Filter out tokens that are not in the vocabulary
    tokens = [token for token in tokens if token in word2vec_model.wv.key_to_index]
    if len(tokens) > 0:
        # Return the average of word embeddings for the tokens
        return np.mean([word2vec_model.wv[t] for t in tokens], axis=0)
    else:
        return None

# Create an array of embeddings for each text
X_resampled = [get_embedding(text) for text in X_resampled]

# resource: https://www.kaggle.com/code/titanpointe/cyberbullying-tweets-eda-automl-dl-bert

### Train Test Split

In [14]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

X_train = np.array(X_train)
X_test = np.array(X_test)

### 1. Using LogisticRegression to train data + Evaluation

In [15]:
# Create and train the Logistic Regression model
logistic_regression_classifier = LogisticRegression(max_iter=1000)
logistic_regression_classifier.fit(X_train, y_train)

In [16]:
# Make predictions on the test set
y_pred_logistic = logistic_regression_classifier.predict(X_test)

# Evaluate the model
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logistic, zero_division=1))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       0.55      0.82      0.66       461
     neutral       0.58      0.32      0.41       459
    positive       0.54      0.52      0.53       417

    accuracy                           0.55      1337
   macro avg       0.56      0.55      0.53      1337
weighted avg       0.56      0.55      0.53      1337



### 2. Using RandomForest to train data + Evaluation

In [17]:
# Create and train the Random Forest model
random_forest_classifier = RandomForestClassifier(random_state=42)
random_forest_classifier.fit(X_train, y_train)

In [18]:
# Make predictions on the test set
y_pred_rf = random_forest_classifier.predict(X_test)

# Evaluate the model
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, zero_division=1))


Random Forest Classification Report:
              precision    recall  f1-score   support

    negative       0.93      1.00      0.96       461
     neutral       0.98      0.85      0.91       459
    positive       0.93      0.98      0.95       417

    accuracy                           0.94      1337
   macro avg       0.94      0.94      0.94      1337
weighted avg       0.95      0.94      0.94      1337




The Random Forest Classification Report provides an evaluation of a sentiment classification model using the Random Forest algorithm. Here's an interpretation of the key metrics:

* Precision:

For the "negative" class, the model has a precision of 93%, indicating that 93% of the instances predicted as negative are indeed negative.
For the "neutral" class, the precision is 98%, suggesting high accuracy in predicting neutral sentiments.
The "positive" class also has a precision of 93%, indicating that 93% of the predicted positive instances are correct.

* Recall:

The recall for the "negative" class is 100%, meaning that the model correctly identifies all actual negative instances.
For the "neutral" class, the recall is 85%, indicating that the model captures 85% of the actual neutral instances.
The "positive" class has a recall of 98%, meaning that 98% of actual positive instances are correctly identified.

* F1-Score:

The F1-score, which balances precision and recall, is 96% for the "negative" class, 91% for the "neutral" class, and 95% for the "positive" class.

* Support:

The "support" column indicates the number of instances for each sentiment class in the test set.

* Accuracy:

The overall accuracy of the model is 94%, representing the proportion of correctly predicted instances among all instances.

* Macro Avg:

The macro average calculates the unweighted average of precision, recall, and F1-score across all classes. In this case, it is 94%, indicating good overall performance.

* Weighted Avg:

The weighted average considers the number of instances in each class, providing an overall performance measure. The weighted average F1-score is 94%, suggesting balanced performance across classes.

In summary, the Random Forest model demonstrates strong performance in sentiment classification, particularly in correctly identifying negative sentiments with high precision and recall. However, there is a slight decrease in performance for the neutral class, as reflected in the lower recall.

### 3. Using DecisionTree to train data + Evaluation

In [19]:
# Create and train the Decision Tree model
decision_tree_classifier = DecisionTreeClassifier(random_state=42)
decision_tree_classifier.fit(X_train, y_train)

In [20]:
# Make predictions on the test set
y_pred_dt = decision_tree_classifier.predict(X_test)

# Evaluate the model
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt, zero_division=1))


Decision Tree Classification Report:
              precision    recall  f1-score   support

    negative       0.87      1.00      0.93       461
     neutral       0.98      0.75      0.85       459
    positive       0.90      0.98      0.94       417

    accuracy                           0.91      1337
   macro avg       0.92      0.91      0.91      1337
weighted avg       0.92      0.91      0.91      1337



### 4. Using Support Vector Machines to train data + Evaluation

In [21]:
# Create and train the Support Vector Machines model
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

In [22]:
# Make predictions on the test set
y_pred_svm = svm_classifier.predict(X_test)

# Evaluate the model
print("Support Vector Machines Classification Report:")
print(classification_report(y_test, y_pred_svm, zero_division=1))


Support Vector Machines Classification Report:
              precision    recall  f1-score   support

    negative       0.48      0.86      0.62       461
     neutral       0.63      0.16      0.25       459
    positive       0.55      0.52      0.54       417

    accuracy                           0.52      1337
   macro avg       0.55      0.52      0.47      1337
weighted avg       0.55      0.52      0.47      1337



## Use Random Forest to label the sentences in the unlabeled dataset.

In [27]:
column_names = ['Id', 'TweetText']
unlabeled_df = pd.read_excel("testing_data.xlsx", names=column_names, header=None)
unlabeled_df = unlabeled_df.drop(['Id'], axis=1)
unlabeled_df.head()

Unnamed: 0,TweetText
0,"Come to the dark side üì±‚Äú@gretcheneclark: Hey @apple, if you send me a free iPhone, I will p..."
1,"Hey @apple, if you send me a free iPhone (any version will do), I will publicly and ceremoniousl..."
2,Thank you @apple for Find My Mac - just located and wiped my stolen Air. #smallvictory #thieving...
3,Thanks to @Apple Covent Garden #GeniusBar for replacing my MacBook keyboard/cracked wristpad dur...
4,@DailyDealChat @apple Thanks!!


In [31]:
# Apply the cleaning function to the 'TweetText' column in the unlabeled dataset
unlabeled_df['CleanedTweet'] = unlabeled_df['TweetText'].apply(lambda x: little_mallet_wrapper.process_string(x, numbers='remove'))
unlabeled_df['CleanedTweet'] = unlabeled_df['CleanedTweet'].apply(clean_text_data)

# Convert Text to Embeddings in the unlabeled dataset
X_unlabeled = [get_embedding(text) for text in unlabeled_df['CleanedTweet']]
X_unlabeled = np.array(X_unlabeled)

# Make predictions on the unlabeled set
y_pred_unlabeled = random_forest_classifier.predict(X_unlabeled)

# Add the predicted labels to the unlabeled dataset
unlabeled_df['PredictedSentiment'] = y_pred_unlabeled

# Reorder columns to have 'Sentiment' first
unlabeled_df = unlabeled_df[['PredictedSentiment', 'TweetText', 'CleanedTweet']]

unlabeled_df


Unnamed: 0,PredictedSentiment,TweetText,CleanedTweet
0,neutral,"Come to the dark side üì±‚Äú@gretcheneclark: Hey @apple, if you send me a free iPhone, I will p...",come dark side gretcheneclark apple send free iphone publicly ceremoniously burn blackberry
1,neutral,"Hey @apple, if you send me a free iPhone (any version will do), I will publicly and ceremoniousl...",apple send free iphone version publicly ceremoniously burn blackberry
2,positive,Thank you @apple for Find My Mac - just located and wiped my stolen Air. #smallvictory #thieving...,thank apple find located wiped stolen smallvictory thievingbastards
3,neutral,Thanks to @Apple Covent Garden #GeniusBar for replacing my MacBook keyboard/cracked wristpad dur...,thanks apple covent garden geniusbar replacing macbook keyboardcracked wristpad lunch break toda...
4,neutral,@DailyDealChat @apple Thanks!!,dailydealchat apple thanks
...,...,...,...
459,neutral,RT......zZzZzZzZzZzZzZzZzZzZ...... #Twitter off,twitter
460,positive,Ha! Even #twitter is telling me we're meant to be!,even twitter telling meant
461,neutral,Sleep time. #twitter off,sleep time twitter
462,positive,#Twitter should have Emoticons,twitter emoticon


## Export Predicted Sentiments file

In [32]:
unlabeled_df.to_csv('predicted_sentiment_tweets.csv', index=False)

# <span style="color:green;">Report</span> 

### <span style="color:orange;">1. What text cleaning approaches did you use?</span>
##### * Preprocessor Library: 
The preprocessor library is utilized to clean the text, which includes the removal of mentions, hashtags, URLs, and reserved words (RT, FAV).
##### * Function clean_text_data: 
I defined it to clean the tweet text. It includes steps such as removing HTML tags, URLs, punctuation, and applying lemmatization.
##### * Regular Expressions:
Regular expressions are used to remove HTML tags and various forms of URLs.
Punctuation is removed, and words with consecutive letters are reduced to a single letter.
##### * TweetTokenizer:
The TweetTokenizer from NLTK is employed to tokenize the tweets, preserving case, stripping handles, and reducing repeated characters.
##### * Stopword Removal and Length Filtering:
Stopwords from the NLTK library are removed, except for the word "not."
Words with a length less than or equal to 3 and non-alphanumeric characters are filtered out.
##### * Inserting Space Before Capital Letters:
Spaces are inserted before capital letters in the middle of sentences to separate words properly.
##### * Lemmatization:
WordNet Lemmatizer from NLTK is used for lemmatization.
##### * The little_mallet_wrapper library is used to further process the text (removing numbers).



### <span style="color:orange;">2. What classification approach did you use? How did you evaluate your work?</span>

##### * Classification Models:
Four classifiers are employed: Logistic Regression, Random Forest, Decision Tree, and Support Vector Machines (SVM). Finally I chose Random Forest for prediction becasue of 94 percent accuracy.
The models are trained on Word2Vec embeddings derived from the preprocessed and cleaned tweet text. 
for Word2Vec Embedding I used this resource: https://www.kaggle.com/code/titanpointe/cyberbullying-tweets-eda-automl-dl-bert

##### * Evaluation:
The classification models are evaluated using the classification_report from scikit-learn.
Metrics such as precision, recall, F1-score, and accuracy are reported for each sentiment class. I chose Random Forest for prediction becasue of 94 percent accuracy.
The evaluation is performed on a test set that was split from the oversampled training data.


### <span style="color:orange;">3. What challenges did you face in order to identify sentiments?</span>
##### * Class Imbalance: The sentiment classes were unbalanced, with fewer positive and negative examples, so I addressed it using Random OverSampling to ensure a more balanced representation of different sentiments.

##### * Cleaning data: It was the hardest part; the data was unclean, and I spent two days to figure it out how to clean it. 

##### * Low Accuracy: My accuracy and recall in the evaluation step were low. I searched extensively on the internet until I found Word2Vec embeddings on Kaggle, and I used them. This increased my accuracy from 75% to 94%. resource: https://www.kaggle.com/code/titanpointe/cyberbullying-tweets-eda-automl-dl-bert


### <span style="color:orange;">4. What are your suggestions to further improve your method?</span>
##### * Cleaning and preprocessing: Certainly, the cleaning step needs more improvement. Cleaner and more accurate data will provide us with better results.
##### * Model Tuning: Explore hyperparameter tuning for the classifiers to enhance model performance.
##### * Ensemble Models: Considering building ensemble models to combine predictions from multiple classifiers for improved accuracy.
##### * Additional Feature Engineering: Explore additional features or sentiment lexicons that might enhance sentiment classification.

### <span style="color:orange;">Why I chose Random Forest Classification?</span>

The Random Forest Classification Report provides an evaluation of a sentiment classification model using the Random Forest algorithm. Here's an interpretation of the key metrics:

* Precision:

For the "negative" class, the model has a precision of 93%, indicating that 93% of the instances predicted as negative are indeed negative.
For the "neutral" class, the precision is 98%, suggesting high accuracy in predicting neutral sentiments.
The "positive" class also has a precision of 93%, indicating that 93% of the predicted positive instances are correct.

* Recall:

The recall for the "negative" class is 100%, meaning that the model correctly identifies all actual negative instances.
For the "neutral" class, the recall is 85%, indicating that the model captures 85% of the actual neutral instances.
The "positive" class has a recall of 98%, meaning that 98% of actual positive instances are correctly identified.

* F1-Score:

The F1-score, which balances precision and recall, is 96% for the "negative" class, 91% for the "neutral" class, and 95% for the "positive" class.

* Support:

The "support" column indicates the number of instances for each sentiment class in the test set.

* Accuracy:

The overall accuracy of the model is 94%, representing the proportion of correctly predicted instances among all instances.

* Macro Avg:

The macro average calculates the unweighted average of precision, recall, and F1-score across all classes. In this case, it is 94%, indicating good overall performance.

* Weighted Avg:

The weighted average considers the number of instances in each class, providing an overall performance measure. The weighted average F1-score is 94%, suggesting balanced performance across classes.

In summary, the Random Forest model demonstrates strong performance in sentiment classification, particularly in correctly identifying negative sentiments with high precision and recall. However, there is a slight decrease in performance for the neutral class, as reflected in the lower recall.