In [None]:

# Importing the necessary packages
import emoji
import pandas as pd
import numpy as np
import re
import string
import nltk
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.decomposition import PCA
from nltk.stem import PorterStemmer
from sklearn.datasets import make_classification


In [None]:

# Load the tweet dataset from CSV and coerce the 'age' column to int.
# NOTE(review): `filepath` is not defined anywhere in the notebook as shown —
# it has to be set before this cell is run.
dataset = pd.read_csv(filepath).astype({'age': int})
print(dataset)

In [None]:

# Punctuation and typographic symbols that get stripped from the tweets.
# An explicit, hand-written list is used (together with our own replace
# function) so that emojis are retained by the cleaning step.
special_characters = list(
    "~:'+[\\@^{%(-\"*|,&<`}._=]>;#$)!?/"  # ASCII punctuation
    "’“”…"                                # curly quotes and ellipsis
)

# Plain Python list of the tweets held in the 'clean' column.
myoldemolist = dataset["clean"].tolist()

def replace_special(myemolist, myspeciallist):
    """Delete every substring in ``myspeciallist`` from each string in ``myemolist``.

    Args:
        myemolist: Iterable of strings (e.g. tweets) to clean.
        myspeciallist: Iterable of substrings to delete. Each one is removed
            with ``str.replace`` in the order given.

    Returns:
        A new list holding the cleaned strings in input order. The input is
        left unmodified, so the raw tweets remain available to the caller.
    """
    # Materialise once so a one-shot iterator can be reused for every tweet.
    specials = list(myspeciallist)
    cleaned = []
    for text in myemolist:
        for special in specials:
            text = text.replace(special, "")
        cleaned.append(text)
    return cleaned

# Strip the special characters from every tweet.
my_new_clean = replace_special(
    myemolist=myoldemolist,
    myspeciallist=special_characters,
)

# Store the cleaned tweets on the data frame as the 'myNewClean' column.
dataset["myNewClean"] = my_new_clean


In [None]:
# ONLY RUN THIS CELL IF YOU WANT TO CREATE A MODEL THAT DOESN'T CONSIDER EMOJIS

# Code-point ranges treated as emoji. They are joined into a single regex
# character class that is compiled once, rather than on every call.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U0001FB00-\U0001FBFF"  # symbols for legacy computing
    "\U0001F004-\U0001F0CF"  # mahjong tiles through playing cards
    "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplements
)
_EMOJI_PATTERN = re.compile("[" + _EMOJI_RANGES + "]+", flags=re.UNICODE)


def remove_emojis(text):
    """Return ``text`` with every run of characters in the emoji ranges deleted.

    Characters outside the listed ranges are left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', text)

# Strip emojis from every cleaned tweet in the 'myNewClean' column.
dataset['myNewClean'] = dataset['myNewClean'].map(remove_emojis)

In [None]:
# Remove English stopwords from every tweet.
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Membership is tested once per word of every tweet, so look words up in a
# set instead of scanning the stopword list each time.
stop_set = set(stop)

# Whitespace-split each tweet, drop the stopwords, and re-join with single spaces.
# NOTE(review): the comparison is case-sensitive — confirm the tweets are
# already lower-cased, otherwise capitalised stopwords survive.
dataset['myNewClean'] = dataset['myNewClean'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_set)
)

In [None]:
# Tokenize every cleaned tweet with NLTK's TweetTokenizer (emojis come out as
# separate tokens) and keep the token lists in the 'tokenized' column.
from nltk.tokenize.casual import TweetTokenizer
t = TweetTokenizer()
dataset['tokenized'] = dataset['myNewClean'].apply(t.tokenize)

# The same token lists as a plain Python list of lists.
word_list = dataset['tokenized'].tolist()

In [None]:
# Creating a TF-IDF vectorizer using the dataset.
# Input to the vectorizer: one list of tokens per tweet.
tokenized_tweets = dataset['tokenized'].tolist()

def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Used as the vectorizer's ``tokenizer`` so that input which is already a
    list of tokens is passed through as-is instead of being re-tokenized.
    """
    return text

# The tweets are already token lists, so the vectorizer gets a pass-through
# tokenizer and lowercasing is disabled (the inputs are lists, not strings).
# token_pattern=None states explicitly that the default regex is unused, which
# avoids scikit-learn's "token_pattern will not be used" warning.
# max_features keeps only the 1000 most frequent terms in the vocabulary.
tfidf = TfidfVectorizer(
    tokenizer=identity_tokenizer,
    token_pattern=None,
    lowercase=False,
    max_features=1000,
)

# Fit on all tweets, then densify the sparse TF-IDF matrix for the model.
tf_tweets = tfidf.fit_transform(tokenized_tweets)
tfidf_array = tf_tweets.toarray()

In [None]:
# Importing xgboost to use gradient decision trees
import xgboost as xgb

In [None]:
###GRADIENT BOOSTED DECISION TREE

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


# Hold out 20% of the tweets for testing; the fixed seed makes the split
# reproducible.
# The target is the 'age' column minus one.
# NOTE(review): presumably the age groups are stored 1-based and are shifted
# to the 0-based class ids the classifier expects — confirm.
age_labels = dataset['age'] - 1
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_array,
    age_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyper-parameters for the gradient-boosted tree classifier; change as needed.
xgb_params = {
    'objective': 'multi:softmax',          # multi-class classification
    'num_class': len(np.unique(y_train)),  # distinct labels in the training split
    'max_depth': 6,                        # maximum depth of each tree
    'n_estimators': 500,                   # number of boosting rounds
    'learning_rate': 0.1,                  # step size shrinkage
    'subsample': 0.8,                      # fraction of samples used per tree
    'colsample_bytree': 0.8,               # fraction of features used per tree
    'random_state': 42,                    # seed for reproducibility
}

# Initialize (but do not yet train) the model.
model = xgb.XGBClassifier(**xgb_params)


In [None]:
# Train the classifier on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out test set with the trained model and report accuracy.
# accuracy_score is imported explicitly: the notebook only uses
# `from sklearn.<module> import ...` forms, which never bind the bare name
# `sklearn`, so `sklearn.metrics.accuracy_score` would raise NameError.
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Calculating overall (support-weighted) precision, recall and F1 scores
from sklearn.metrics import precision_recall_fscore_support

# `pos_label` only matters for average='binary'; with average='weighted' it is
# ignored, so it is not passed.
# The results get weighted_* names so that the scalar F1 value does not rebind
# `f1_score`, which the per-class cell below calls as a function.
weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted'
)

print('Precision:', weighted_precision)
print('Recall:', weighted_recall)
print('F1 Score:', weighted_f1)

In [None]:

# Calculate precision, recall, and F1 score for each class.
# Imported in this cell so that all three names are guaranteed to be the
# scikit-learn functions here, whatever earlier cells bound them to.
from sklearn.metrics import f1_score, precision_score, recall_score

# average=None returns one score per class instead of a single aggregate.
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

# Iterate over the scores actually returned rather than assuming five classes.
# NOTE(review): position i is the i-th sorted label present in y_test/y_pred;
# it equals the label value only if the labels are contiguous from 0.
for class_label in range(len(precision)):
    print(f"Class {class_label}:")
    print(f"Precision: {precision[class_label]}")
    print(f"Recall: {recall[class_label]}")
    print(f"F1 Score: {f1[class_label]}")


In [None]:
# Visualizing precision, recall, and F1 scores by class
import matplotlib.pyplot as plt
import numpy as np

# One tick label per class, in the order of the per-class score arrays
class_labels = [f'Class {i}' for i in range(len(precision))]

# Values for precision, recall, and F1 score for each class
precision_values = precision
recall_values = recall
f1_values = f1

# (metric name, per-class values, bar colour) for each stacked panel
metric_panels = [
    ('Precision', precision_values, 'b'),
    ('Recall', recall_values, 'g'),
    ('F1 Score', f1_values, 'r'),
]

# One subplot per metric, stacked vertically
fig, axes = plt.subplots(nrows=len(metric_panels), ncols=1, figsize=(8, 12))

for ax, (metric_name, values, color) in zip(axes, metric_panels):
    ax.bar(class_labels, values, color=color, alpha=0.7)
    ax.set_ylabel(metric_name)
    ax.set_title(f'{metric_name} for Each Class')

# Adjust spacing between subplots
plt.tight_layout()

# Save before showing: savefig() requires a target path, and saving through
# pyplot after show() can write an empty figure.
# NOTE(review): the original call gave no filename; adjust this path as needed.
fig.savefig('class_metrics.png')

# Show the plot
plt.show()