# Group 6
## 2301899156 - Muhammad Seitdha Fatah Roni
## 2301900012 - Michael Hakkinen
## 2301921881 - Rhamdany Ganio Teslatu
## 2301924366 - Eric Wijayanto Wirawan
## 2301934146 - Michael Rufi Tallaut Rongkos

In [12]:
# Import library
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [13]:
# Create a function to load the dataset
def Load_Dataset(file_Name):
    dataset = pd.read_csv(file_Name, encoding="ISO-8859-1")

    return dataset

# Create a function to delete unused data
def Delete_Unused_Data(dataset, column):
    dataset.drop(column, axis = 1, inplace = True)

    # delete nan
    dataset = dataset.dropna()

    return dataset

# Create a function to preprocess the data
def Preprocess_Data(data):
    # Delete all symbol from the data
    data = data.str.replace("@[\w]*", "")

    # Change all whitespace in the data to space
    data = data.str.replace("[^a-zA-Z0-9]", " ")

    # Tokenize the data
    text = data.apply(lambda x: x.split())

    # Remove stopword from the data
    stop_Words = stopwords.words('english')
    text = text.apply(lambda x:[word for word in x if not word in stop_Words])

    # Perform stemming on the data
    stemmer = SnowballStemmer("english")
    text = text.apply(lambda x: [stemmer.stem(word) for word in x])

    # De-tokenize the data
    processed = []
    for i in text:
        sentence = ""
        for s in i:
            sentence = sentence + str(s) + " "
        processed.append(sentence)

    return processed

# Create a function to convert the label to number
def Change_Label(label):
    if label == 'neutral':
        return 1
    elif label == 'positive':
        return 0
    elif label == 'negative':
        return 2

In [14]:
# Load the training dataset
dataset_Train = Load_Dataset("train.csv")

# Print the value count of each class in the dataset
print("Class count")
print(dataset_Train['sentiment'].value_counts())

# Print the first 10 data from the dataset
dataset_Train.head(10)

Class count
neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64


Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,night,70-100,Antigua and Barbuda,97929,440.0,223
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,morning,0-20,Argentina,45195774,2736690.0,17
7,50e14c0bb8,Soooo high,Soooo high,neutral,noon,21-30,Armenia,2963243,28470.0,104
8,e050245fbd,Both of you,Both of you,neutral,night,31-45,Australia,25499884,7682300.0,3
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive,morning,46-60,Austria,9006398,82400.0,109


In [15]:
# Remove unused data from the dataset
dataset_Train = Delete_Unused_Data(dataset_Train, ['textID', 'selected_text','Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'])

# Preprocess the data
train_Data = Preprocess_Data(dataset_Train['text'])

# Change the label to integer
train_Target = dataset_Train['sentiment'].apply(lambda x: Change_Label(x))

# Create a list that store the label
label = [
    "positive",
    "neutral",
    "negative"
]

In [16]:
# Perform TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1,3)).fit(train_Data)
train_Feature = vectorizer.transform(train_Data)

In [17]:
# Divide the data into 70% training set and 30% testing set
feature_Train, feature_Test, target_Train, target_Test = train_test_split(train_Feature, train_Target, test_size=0.3, random_state=42)

# Divide the testing set into 70% validation set and 30% testing set
feature_Validation, feature_Test, target_Validation, target_Test = train_test_split(feature_Test, target_Test, test_size=0.3, random_state=42)

# Print the shape of each data
print(f"Feature Train:{feature_Train.shape}  Target Train:{target_Train.shape}")
print(f"Feature Validation:{feature_Validation.shape}  Target Validation:{target_Validation.shape}")
print(f"Feature Test:{feature_Test.shape}  Target Test:{target_Test.shape}")

Feature Train:(19236, 300749)  Target Train:(19236,)
Feature Validation:(5770, 300749)  Target Validation:(5770,)
Feature Test:(2474, 300749)  Target Test:(2474,)


In [18]:
# Create a function to calculate and print recall score, precision score and accuracy score
def Print_Metric(model_Name, target_True, target_Predict):
    # Calculate recall, precision and accuracy
    recall = recall_score(target_True, target_Predict, average="macro")
    precision = precision_score(target_True, target_Predict, average="macro")
    accuracy = accuracy_score(target_True, target_Predict)

    # Print recall, precision and accuracy
    print(f"Model {model_Name}")
    print("Recall: {:.3f}".format(recall))
    print("Precision: {:.3f}".format(precision))
    print("Accuracy: {:.3f}".format(accuracy))
    print()

In [19]:
# Create KNN model
model_KNN = KNeighborsClassifier(n_neighbors=7, algorithm="auto", leaf_size=10, p=1, n_jobs=-1)

# Train the model
model_KNN.fit(feature_Train, target_Train)

# Use the model to predict the testing set
target_Predict_KNN = model_KNN.predict(feature_Test)

In [20]:
# Create Logistic Regression model
model_LogisticRegression = LogisticRegression(max_iter = 200, penalty="l1", C=5.0, solver='liblinear', class_weight='balanced')

# Train the model
model_LogisticRegression.fit(feature_Train, target_Train)

# Use the model to predict the testing set
target_Predict_LogisticRegression = model_LogisticRegression.predict(feature_Test)



In [21]:
# Create XGBoost model
model_XGBoost = XGBClassifier(objective="multi:softprob", use_label_encoder=False, eval_metric='mlogloss', booster='gbtree', learning_rate=0.1, max_depth=20, min_child_weight=10, n_estimators=130)

# Train the model
model_XGBoost.fit(feature_Train, target_Train, eval_set=[(feature_Validation, target_Validation)], verbose=True)

# Use the model to predict the testing set 
target_Predict_XGBoost = model_XGBoost.predict(feature_Test)

[0]	validation_0-mlogloss:1.06361
[1]	validation_0-mlogloss:1.03377
[2]	validation_0-mlogloss:1.00834
[3]	validation_0-mlogloss:0.98651
[4]	validation_0-mlogloss:0.96732
[5]	validation_0-mlogloss:0.95100
[6]	validation_0-mlogloss:0.93660
[7]	validation_0-mlogloss:0.92401
[8]	validation_0-mlogloss:0.91296
[9]	validation_0-mlogloss:0.90283
[10]	validation_0-mlogloss:0.89398
[11]	validation_0-mlogloss:0.88604
[12]	validation_0-mlogloss:0.87925
[13]	validation_0-mlogloss:0.87245
[14]	validation_0-mlogloss:0.86652
[15]	validation_0-mlogloss:0.86124
[16]	validation_0-mlogloss:0.85625
[17]	validation_0-mlogloss:0.85188
[18]	validation_0-mlogloss:0.84723
[19]	validation_0-mlogloss:0.84353
[20]	validation_0-mlogloss:0.84002
[21]	validation_0-mlogloss:0.83650
[22]	validation_0-mlogloss:0.83325
[23]	validation_0-mlogloss:0.83005
[24]	validation_0-mlogloss:0.82705
[25]	validation_0-mlogloss:0.82433
[26]	validation_0-mlogloss:0.82209
[27]	validation_0-mlogloss:0.81931
[28]	validation_0-mlogloss:0.8

In [22]:
# Print the result
Print_Metric("LogisticRegression", target_Test, target_Predict_LogisticRegression)
Print_Metric("KNN", target_Test, target_Predict_KNN)
Print_Metric("XGBClassifier", target_Test, target_Predict_XGBoost)

Model LogisticRegression
Recall: 0.707
Precision: 0.720
Accuracy: 0.712

Model KNN
Recall: 0.342
Precision: 0.784
Accuracy: 0.413

Model XGBClassifier
Recall: 0.684
Precision: 0.724
Accuracy: 0.700



## Because Logistic Regression have the most highest value, Logistic Regression model will be used to predict the test dataset

In [23]:
# Load the test dataset
dataset_Test = Load_Dataset("test.csv")

# Delete unused data from test dataset
dataset_Test = Delete_Unused_Data(dataset_Test, ['textID','Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'])

# Preprocess the data
test_Data = Preprocess_Data(dataset_Test['text'])

# Change the label to integer
test_Target = dataset_Test['sentiment'].apply(lambda x: Change_Label(x))

# Use tf idf
test_Data = vectorizer.transform(test_Data)

In [24]:
# Use the model to predict the test dataset
target_Predict_Test = model_LogisticRegression.predict(test_Data)

# Print the recall, precision and accuracy
Print_Metric("LogisticRegression", test_Target, target_Predict_Test)

Model LogisticRegression
Recall: 0.716
Precision: 0.701
Accuracy: 0.702

