In [13]:
import pandas as pd
import numpy as np
import os
import pickle


In [14]:

# Make the dataset folder the working directory so that the CSV files can be
# opened by bare file name, then show the folder's contents.
data_path = "../data"
os.chdir(data_path)
cwd = os.getcwd()
print("Current Directory:", cwd)
os.listdir(data_path)

Current Directory: /home/atawimas/Project/DEEP-LEARNING/NLP/Text-Mining_Project/data


['data.ipynb:Zone.Identifier',
 'train.csv:Zone.Identifier',
 'data.ipynb',
 'tfidf_model.pkl',
 'test.csv:Zone.Identifier',
 'train.csv',
 'test.csv']

In [15]:
# Load the train and test splits from the current working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [16]:
# Count the missing values in each column of the training split.
train_data.isnull().sum()

text     4
label    0
dtype: int64

In [17]:
# Discard every training row that has at least one missing value.
# Only the training split is cleaned here; test_data is left as loaded.
train_data = train_data.dropna(axis=0, how="any")

In [18]:
# Switch to the model output folder: the pickle files written by the cells
# below use bare file names, so they are created in this directory.
data_path = "../Saved_Model"
os.chdir(data_path)
cwd = os.getcwd()
print("Current Directory:", cwd)

Current Directory: /home/atawimas/Project/DEEP-LEARNING/NLP/Text-Mining_Project/Saved_Model


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.downloader as api
# TF-IDF: fit the vectorizer on the training text and keep the resulting
# training feature matrix.
tfid = TfidfVectorizer()
x_TF_IDF = tfid.fit_transform(train_data['text'])

# Persist the fitted vectorizer in the current working directory.
with open('tfidf_model.pkl', 'wb') as vectorizer_file:
    vectorizer_file.write(pickle.dumps(tfid))

# Pre-trained word vectors fetched through gensim's downloader. The dataset
# name refers to GloVe vectors, although the notebook labels the features
# derived from them "Word2Vec".
# NOTE(review): the default vector_size=100 of sentence_to_vec presumably
# mirrors the "-100" in this dataset name — confirm.
glove_model = api.load("glove-wiki-gigaword-100")
def sentence_to_vec(sentence, model, vector_size=100):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentence: Text to embed; it is split on whitespace. A value that is
            not a string (for example ``None`` or the float ``NaN`` pandas
            uses for a missing cell) is treated as an empty sentence.
        model: Word-vector lookup that supports ``word in model`` and
            ``model[word]``.
        vector_size: Length of the zero vector returned when no word of the
            sentence is found in ``model``.

    Returns:
        The element-wise mean of the vectors of the words found in ``model``,
        or ``np.zeros(vector_size)`` when there is none.
    """
    # Missing text reaches this function as a float NaN; calling .split() on
    # it would raise AttributeError, so fall back to the zero vector.
    if not isinstance(sentence, str):
        return np.zeros(vector_size)

    # NOTE(review): tokens are looked up exactly as written (no lowercasing or
    # punctuation stripping). If the vocabulary of `model` is lowercase-only,
    # capitalised words are silently skipped — confirm this is intended.
    word_vectors = [model[word] for word in sentence.split() if word in model]
    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

In [20]:
# Targets for training and for the final evaluation.
label, y_test = train_data['label'], test_data['label']

# Test features from the already-fitted TF-IDF vectorizer (transform only).
x_test_tfid = tfid.transform(test_data['text'])

# Mean word-vector features for both splits, one row per sentence.
x_train_Word2Vec, x_test_Word2Vec = (
    np.array([sentence_to_vec(text, glove_model) for text in frame['text']])
    for frame in (train_data, test_data)
)

In [21]:
# Re-select the model output folder before the estimators are pickled.
data_path = "../Saved_Model"
os.chdir(data_path)
cwd = os.getcwd()
print("Current Directory:", cwd)

Current Directory: /home/atawimas/Project/DEEP-LEARNING/NLP/Text-Mining_Project/Saved_Model


In [10]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV



In [23]:
# Scorer names keyed by a short metric label. The GridSearchCV calls in this
# notebook pass scoring='accuracy' directly rather than this mapping.
scoring = dict(
    accuracy='accuracy',
    precision='precision_weighted',
    recall='recall_weighted',
    f1='f1_weighted',
)

# Empty results table: one row is appended per (text representation, model)
# pair holding its best parameters and test-set metrics.
best_estimators_df = pd.DataFrame(
    columns=[
        "Text Representation",
        "Model",
        "Param",
        "Accuracy",
        "Precision",
        "Recall",
        "F1-Score",
    ]
)

In [None]:
# Grid search over the SVC regularisation strength C (RBF kernel only) on the
# TF-IDF features, selecting by 3-fold cross-validated accuracy.
param_grid = dict(kernel=['rbf'], C=[1, 0.1, 0.001])
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    refit='accuracy',
    cv=3,
    n_jobs=-1,
).fit(x_TF_IDF, label)

# Keep the best estimator found by the search and persist it with pickle.
best_estimator = grid_search.best_estimator_
with open('tfid_svc_model.pkl', 'wb') as f:
    f.write(pickle.dumps(best_estimator))

# Predict the test set with the in-memory best estimator.
y_pred = best_estimator.predict(x_test_tfid)

# Test-set metrics, each scorer called with its default arguments.
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    sep="\n",
)

In [79]:
# Collect the metrics of the TF-IDF + SVC run as a one-row DataFrame.
new_row = pd.DataFrame({
    "Text Representation": ["TF-IDF"],
    "Model": ["SVC"],
    "Param": [grid_search.best_params_],
    "Accuracy": [accuracy],
    "Precision": [precision],
    "Recall": [recall],
    "F1-Score": [f1]
})

# Concatenating onto a still-empty results table makes pandas emit a
# FutureWarning about how empty entries affect the result dtypes, so the
# first row simply becomes the table; later rows are appended as before.
best_estimators_df = (
    new_row.copy()
    if best_estimators_df.empty
    else pd.concat([best_estimators_df, new_row], ignore_index=True)
)

# Display the row that was just recorded
new_row

  best_estimators_df = pd.concat([best_estimators_df, new_row], ignore_index=True)


Unnamed: 0,Text Representation,Model,Param,Accuracy,Precision,Recall,F1-Score
0,TF-IDF,SVC,"{'C': 1, 'kernel': 'rbf'}",0.918947,0.917406,0.92585,0.921609


In [80]:
# Same SVC grid search as for TF-IDF, but trained on the mean word-vector
# features and selected by 3-fold cross-validated accuracy.
param_grid = dict(kernel=['rbf'], C=[1, 0.1, 0.001])
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    refit='accuracy',
    cv=3,
    n_jobs=-1,
).fit(x_train_Word2Vec, label)

# Keep the best estimator found by the search and persist it with pickle.
best_estimator = grid_search.best_estimator_
with open('w2v_svc_model.pkl', 'wb') as f:
    f.write(pickle.dumps(best_estimator))

# Predict the test set with the in-memory best estimator.
y_pred = best_estimator.predict(x_test_Word2Vec)

# Test-set metrics, each scorer called with its default arguments.
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    sep="\n",
)

Accuracy: 0.8388157894736842
Precision: 0.8349127182044888
Recall: 0.856047046791102
F1-Score: 0.8453478096199974


In [81]:
# Collect the metrics of the word-vector + SVC run as a one-row DataFrame.
new_row = pd.DataFrame({
    "Text Representation": ["Word2Vec"],
    "Model": ["SVC"],
    "Param": [grid_search.best_params_],
    "Accuracy": [accuracy],
    "Precision": [precision],
    "Recall": [recall],
    "F1-Score": [f1]
})

# If this cell runs while the results table is still empty, a plain concat
# makes pandas emit a FutureWarning about empty entries and result dtypes;
# in that case the row simply becomes the table.
best_estimators_df = (
    new_row.copy()
    if best_estimators_df.empty
    else pd.concat([best_estimators_df, new_row], ignore_index=True)
)

# Display the row that was just recorded
new_row

Unnamed: 0,Text Representation,Model,Param,Accuracy,Precision,Recall,F1-Score
0,Word2Vec,SVC,"{'C': 1, 'kernel': 'rbf'}",0.838816,0.834913,0.856047,0.845348


### Logistic Regression

In [82]:
# Parameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],  # Inverse regularisation strength
    'solver': ['liblinear', 'saga']  # Solvers to compare
}

# Both solvers in the grid shuffle the data internally, so the estimator is
# seeded to make the search and the saved model reproducible across runs.
# NOTE(review): 'saga' may need more than the default number of iterations to
# converge on these features — check for ConvergenceWarning and raise max_iter
# if it appears.
grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid_lr,
    cv=3,
    scoring='accuracy',
    refit='accuracy',
    n_jobs=-1
)

# Fit on the TF-IDF training features
grid_search.fit(x_TF_IDF, label)

# Best estimator found by the search
best_lr_model = grid_search.best_estimator_

# Persist the best estimator with pickle
with open('tfid_lr_model.pkl', 'wb') as f:
    pickle.dump(best_lr_model, f)

# Predict the test set with the best estimator
y_pred_lr = best_lr_model.predict(x_test_tfid)

# Test-set metrics, each scorer called with its default arguments
accuracy = accuracy_score(y_test, y_pred_lr)
precision = precision_score(y_test, y_pred_lr)
recall = recall_score(y_test, y_pred_lr)
f1 = f1_score(y_test, y_pred_lr)

# Report the selected parameters and the metrics
print(f"Logistic Regression Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")


Logistic Regression Best Parameters: {'C': 10, 'solver': 'liblinear'}
Accuracy: 0.8315789473684211
Precision: 0.8279730740463724
Recall: 0.8491434415750447
F1-Score: 0.8384246402423631


In [83]:
# Collect the metrics of the TF-IDF + Logistic Regression run as a one-row
# DataFrame.
new_row = pd.DataFrame({
    "Text Representation": ["TF-IDF"],
    "Model": ["Logistic Regression"],
    "Param": [grid_search.best_params_],
    "Accuracy": [accuracy],
    "Precision": [precision],
    "Recall": [recall],
    "F1-Score": [f1]
})

# If this cell runs while the results table is still empty, a plain concat
# makes pandas emit a FutureWarning about empty entries and result dtypes;
# in that case the row simply becomes the table.
best_estimators_df = (
    new_row.copy()
    if best_estimators_df.empty
    else pd.concat([best_estimators_df, new_row], ignore_index=True)
)

# Display the row that was just recorded
new_row

Unnamed: 0,Text Representation,Model,Param,Accuracy,Precision,Recall,F1-Score
0,TF-IDF,Logistic Regression,"{'C': 10, 'solver': 'liblinear'}",0.912895,0.913043,0.918179,0.915604


In [84]:
# Parameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],  # Inverse regularisation strength
    'solver': ['liblinear', 'saga']  # Solvers to compare
}

# Both solvers in the grid shuffle the data internally, so the estimator is
# seeded to make the search and the saved model reproducible across runs.
# NOTE(review): 'saga' may need more than the default number of iterations to
# converge on these features — check for ConvergenceWarning and raise max_iter
# if it appears.
grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid_lr,
    cv=3,
    scoring='accuracy',
    refit='accuracy',
    n_jobs=-1
)

# Fit on the mean word-vector training features
grid_search.fit(x_train_Word2Vec, label)

# Best estimator found by the search
best_lr_model = grid_search.best_estimator_

# Persist the best estimator with pickle
with open('w2v_lr_model.pkl', 'wb') as f:
    pickle.dump(best_lr_model, f)

# Predict the test set with the best estimator
y_pred_lr = best_lr_model.predict(x_test_Word2Vec)

# Test-set metrics, each scorer called with its default arguments
accuracy = accuracy_score(y_test, y_pred_lr)
precision = precision_score(y_test, y_pred_lr)
recall = recall_score(y_test, y_pred_lr)
f1 = f1_score(y_test, y_pred_lr)

# Report the selected parameters and the metrics
print(f"Logistic Regression Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")




Logistic Regression Best Parameters: {'C': 10, 'solver': 'saga'}
Accuracy: 0.8322368421052632
Precision: 0.8288423153692615
Recall: 0.8493991306571209
F1-Score: 0.8389948225786084




In [85]:
# Collect the metrics of the word-vector + Logistic Regression run as a
# one-row DataFrame.
new_row = pd.DataFrame({
    "Text Representation": ["Word2Vec"],
    "Model": ["Logistic Regression"],
    "Param": [grid_search.best_params_],
    "Accuracy": [accuracy],
    "Precision": [precision],
    "Recall": [recall],
    "F1-Score": [f1]
})

# If this cell runs while the results table is still empty, a plain concat
# makes pandas emit a FutureWarning about empty entries and result dtypes;
# in that case the row simply becomes the table.
best_estimators_df = (
    new_row.copy()
    if best_estimators_df.empty
    else pd.concat([best_estimators_df, new_row], ignore_index=True)
)

# Display the row that was just recorded
new_row

Unnamed: 0,Text Representation,Model,Param,Accuracy,Precision,Recall,F1-Score
0,Word2Vec,Logistic Regression,"{'C': 10, 'solver': 'saga'}",0.832237,0.828842,0.849399,0.838995


### AFINN lexicon model

In [86]:
from afinn import Afinn

# Create the AFINN scorer
afinn = Afinn()

# Score every test sentence with AFINN
afinn_scores = [afinn.score(sentence) for sentence in test_data['text']]

# Turn scores into labels: a strictly positive score becomes 1, anything else
# (including a score of exactly 0) becomes 0.
# NOTE(review): assumes label 1 means positive and 0 negative in test.csv — confirm.
y_pred= [1 if score > 0 else 0 for score in afinn_scores]

# Test-set metrics, each scorer called with its default arguments
accuracy_afinn = accuracy_score(y_test, y_pred)
precision_afinn = precision_score(y_test, y_pred)
recall_afinn = recall_score(y_test, y_pred)
f1_afinn = f1_score(y_test, y_pred)

# Collect the metrics as a one-row DataFrame
new_row_afinn = pd.DataFrame({
    "Text Representation": ["N/A"],
    "Model": ["AFINN"],
    "Param": ["N/A"],
    "Accuracy": [accuracy_afinn],
    "Precision": [precision_afinn],
    "Recall": [recall_afinn],
    "F1-Score": [f1_afinn]
})

# If this cell runs while the results table is still empty, a plain concat
# makes pandas emit a FutureWarning about empty entries and result dtypes;
# in that case the row simply becomes the table.
best_estimators_df = (
    new_row_afinn.copy()
    if best_estimators_df.empty
    else pd.concat([best_estimators_df, new_row_afinn], ignore_index=True)
)

# Display the row that was just recorded
new_row_afinn


Unnamed: 0,Text Representation,Model,Param,Accuracy,Precision,Recall,F1-Score
0,,AFINN,,0.699342,0.637285,0.964971,0.767619


### VADER Model

In [24]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Create the VADER analyzer
vader = SentimentIntensityAnalyzer()

# Take the 'compound' score of every test sentence
vader_scores = [vader.polarity_scores(sentence)['compound'] for sentence in test_data['text']]

# Turn scores into labels: a strictly positive compound score becomes 1,
# anything else (including exactly 0) becomes 0.
# NOTE(review): assumes label 1 means positive and 0 negative in test.csv — confirm.
y_pred = [1 if score > 0 else 0 for score in vader_scores]

# Test-set metrics, each scorer called with its default arguments
accuracy_vader = accuracy_score(y_test, y_pred)
precision_vader = precision_score(y_test, y_pred)
recall_vader = recall_score(y_test, y_pred)
f1_vader = f1_score(y_test, y_pred)

# Collect the metrics as a one-row DataFrame
new_row_vader = pd.DataFrame({
    "Text Representation": ["N/A"],
    "Model": ["VADER"],
    "Param": ["N/A"],
    "Accuracy": [accuracy_vader],
    "Precision": [precision_vader],
    "Recall": [recall_vader],
    "F1-Score": [f1_vader]
})

# Concatenating onto a still-empty results table makes pandas emit a
# FutureWarning about how empty entries affect the result dtypes, so when the
# table is empty the row simply becomes the table.
best_estimators_df = (
    new_row_vader.copy()
    if best_estimators_df.empty
    else pd.concat([best_estimators_df, new_row_vader], ignore_index=True)
)

# Display the row that was just recorded
new_row_vader


  best_estimators_df = pd.concat([best_estimators_df, new_row_vader], ignore_index=True)


Unnamed: 0,Text Representation,Model,Param,Accuracy,Precision,Recall,F1-Score
0,,VADER,,0.661579,0.60632,0.976221,0.748041


In [88]:
# Show the accumulated comparison table: one row per model / text
# representation that has been appended so far.
best_estimators_df

Unnamed: 0,Text Representation,Model,Param,Accuracy,Precision,Recall,F1-Score
0,TF-IDF,SVC,"{'C': 1, 'kernel': 'rbf'}",0.918947,0.917406,0.92585,0.921609
1,Word2Vec,SVC,"{'C': 1, 'kernel': 'rbf'}",0.838816,0.834913,0.856047,0.845348
2,TF-IDF,Logistic Regression,"{'C': 10, 'solver': 'liblinear'}",0.912895,0.913043,0.918179,0.915604
3,Word2Vec,Logistic Regression,"{'C': 10, 'solver': 'saga'}",0.832237,0.828842,0.849399,0.838995
4,,AFINN,,0.699342,0.637285,0.964971,0.767619
5,,VADER,,0.661579,0.60632,0.976221,0.748041
