In [1]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from google.colab import files
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import KFold
from collections import Counter

## Data Understanding and Preparation

In [3]:
# membaca data
uploaded = files.upload()
file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name)

Saving Clear vs Vague Word for Code Documentation.csv to Clear vs Vague Word for Code Documentation.csv


In [4]:
# data cleaning

df['Words'] = df['Words'].str.lower()
df['Words'] = df['Words'].str.replace(r'\d+', '')
df['Words'] = df['Words'].str.replace(r'[^\w\s]', '')
df['Words'] = df['Words'].str.replace(r'\n', ' ')
df['Words'] = df['Words'].str.replace(r'\r', ' ')

df['Words'] = df['Words'].str.split()
df = df.explode('Words')

In [5]:
# data understanding

print('Shape: ',df.shape)
print('Columns: ',df.columns)
print(df.dtypes)
display(df)

print('\nClass distribution: \n',df['Sentiment'].value_counts())

print('\nDuplicate rows:', df.duplicated().sum())

print('\nBasic statistics: \n',df.describe())

print('\nMissing value: \n',df.isnull().sum())

Shape:  (232, 2)
Columns:  Index(['Words', 'Sentiment'], dtype='object')
Words        object
Sentiment    object
dtype: object


Unnamed: 0,Words,Sentiment
0,input,+
1,output,+
2,parameter,+
3,argument,+
4,function,+
...,...,...
197,allow,-
198,make,-
198,smaller,-
199,make,-



Class distribution: 
 Sentiment
-    132
+    100
Name: count, dtype: int64

Duplicate rows: 109

Basic statistics: 
        Words Sentiment
count    232       232
unique   119         2
top     make         -
freq      11       132

Missing value: 
 Words        0
Sentiment    0
dtype: int64


In [6]:
# data preparation

# menghapus baris duplikat
df = df.drop_duplicates()
print('Shape after removing duplicates: ',df.shape)

# oversampling
class_counts = df['Sentiment'].value_counts()
minority_class = class_counts.idxmin()

oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)

X_resampled, y_resampled = oversampler.fit_resample(df.drop('Sentiment', axis=1), df['Sentiment'])

df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=df.columns.drop('Sentiment')), pd.DataFrame(y_resampled, columns=['Sentiment'])], axis=1)

print('Shape after oversampling: ', df_resampled.shape)
print('\nClass distribution: \n',df_resampled['Sentiment'].value_counts())

# membagi data training, validation dengan K-Fold Cross Validation
X = df_resampled.drop(columns="Sentiment")
y = df_resampled["Sentiment"].copy()

k = 5

kf = KFold(n_splits=k, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[test_index]


X_train_fold = X_train_fold.values.reshape(-1)
X_val_fold = X_val_fold.values.reshape(-1)

print('Rows training data: ', X_train_fold.shape[0])
print('Rows validation data: ', X_val_fold.shape[0])

Shape after removing duplicates:  (123, 2)
Shape after oversampling:  (124, 2)

Class distribution: 
 Sentiment
+    62
-    62
Name: count, dtype: int64
Rows training data:  100
Rows validation data:  24


## Naive Bayes: Data Modelling

In [7]:
# bentuk vocabulary dan hitung frequency setiap term
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_fold)

# Train Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train_fold)

print('Model Naive Bayes:')
print(clf.class_log_prior_)
print(clf.feature_log_prob_)

Model Naive Bayes:
[-0.67334455 -0.71334989]
[[-4.31748811 -5.01063529 -5.01063529 -5.01063529 -4.31748811 -4.31748811
  -5.01063529 -4.31748811 -4.31748811 -5.01063529 -5.01063529 -4.31748811
  -5.01063529 -5.01063529 -5.01063529 -5.01063529 -4.31748811 -5.01063529
  -5.01063529 -4.31748811 -4.31748811 -4.31748811 -5.01063529 -5.01063529
  -4.31748811 -5.01063529 -5.01063529 -4.31748811 -4.31748811 -5.01063529
  -4.31748811 -4.31748811 -4.31748811 -5.01063529 -4.31748811 -5.01063529
  -5.01063529 -5.01063529 -5.01063529 -4.31748811 -4.31748811 -4.31748811
  -4.31748811 -4.31748811 -4.31748811 -4.31748811 -5.01063529 -4.31748811
  -4.31748811 -5.01063529 -4.31748811 -4.31748811 -5.01063529 -5.01063529
  -5.01063529 -5.01063529 -4.31748811 -4.31748811 -4.31748811 -4.31748811
  -4.31748811 -4.31748811 -4.31748811 -5.01063529 -4.31748811 -4.31748811
  -4.31748811 -4.31748811 -5.01063529 -5.01063529 -4.31748811 -5.01063529
  -4.31748811 -5.01063529 -4.31748811 -4.31748811 -4.31748811 -5.01

In [8]:
# data ditampilkan dalam bentuk dataframe
df.head()

Unnamed: 0,Words,Sentiment
0,input,+
1,output,+
2,parameter,+
3,argument,+
4,function,+


In [9]:
# vocabulary
print('Vocabulary: ',vectorizer.get_feature_names_out())
print('Ukuran vocabulary (|V|): ',X_train.shape[1])

Vocabulary:  ['access' 'add' 'against' 'apart' 'append' 'argument' 'arrange'
 'authenticate' 'authorize' 'back' 'bigger' 'boolean' 'break' 'call'
 'change' 'check' 'class' 'code' 'combine' 'compare' 'concatenate'
 'condition' 'convert' 'create' 'customize' 'data' 'decimal' 'decode'
 'decompress' 'decrease' 'decrement' 'decrypt' 'define' 'describe'
 'deserialize' 'do' 'doohickey' 'down' 'efficiency' 'encode' 'encrypt'
 'error' 'evaluate' 'exceed' 'exception' 'execute' 'false' 'filter'
 'float' 'from' 'function' 'generate' 'get' 'give' 'go' 'increase'
 'increment' 'initialize' 'input' 'instance' 'instantiate' 'integer'
 'invoke' 'join' 'loop' 'merge' 'method' 'modify' 'not' 'number' 'object'
 'oh' 'optimize' 'over' 'parameterize' 'parse' 'reference' 'remove'
 'replace' 'retrieve' 'return' 'run' 'secret' 'serialize' 'set' 'special'
 'stuff' 'synchronize' 'terminate' 'thing' 'thingify' 'things' 'thingy'
 'to' 'true' 'uh' 'up' 'variable' 'verify']
Ukuran vocabulary (|V|):  99


In [10]:
# matriks term-document (frequency per dokumen)
df_freq = pd.DataFrame(X_train.toarray(), columns = vectorizer.get_feature_names_out())
df_freq.head()

Unnamed: 0,access,add,against,apart,append,argument,arrange,authenticate,authorize,back,...,thing,thingify,things,thingy,to,true,uh,up,variable,verify
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# matriks term-class (frequency per kelas)
freq_neg=df_freq[0:3].sum().values
freq_pos=df_freq[3:].sum().values
df_freq_class = pd.DataFrame([freq_neg,freq_pos], columns = vectorizer.get_feature_names_out())
df_freq_class.head()

Unnamed: 0,access,add,against,apart,append,argument,arrange,authenticate,authorize,back,...,thing,thingify,things,thingy,to,true,uh,up,variable,verify
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [12]:
# matriks term-class (probability per kelas)
prob_neg=(df_freq_class.iloc[0]+1)/(df_freq_class.iloc[0].sum()+df_freq_class.shape[1])
prob_pos=(df_freq_class.iloc[1]+1)/(df_freq_class.iloc[1].sum()+df_freq_class.shape[1])
df_prob_class = pd.DataFrame([prob_neg,prob_pos], columns = vectorizer.get_feature_names_out())
df_prob_class.head()

Unnamed: 0,access,add,against,apart,append,argument,arrange,authenticate,authorize,back,...,thing,thingify,things,thingy,to,true,uh,up,variable,verify
0,0.009804,0.009804,0.009804,0.009804,0.009804,0.019608,0.009804,0.009804,0.009804,0.009804,...,0.009804,0.009804,0.009804,0.009804,0.009804,0.009804,0.009804,0.009804,0.009804,0.009804
1,0.010101,0.010101,0.010101,0.010101,0.010101,0.005051,0.010101,0.010101,0.010101,0.010101,...,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101,0.010101


In [13]:
# matriks term-class (log probability per kelas)
import numpy as np
df_logprob_class=np.log(df_prob_class) #natural logarithmic
df_logprob_class.head()

Unnamed: 0,access,add,against,apart,append,argument,arrange,authenticate,authorize,back,...,thing,thingify,things,thingy,to,true,uh,up,variable,verify
0,-4.624973,-4.624973,-4.624973,-4.624973,-4.624973,-3.931826,-4.624973,-4.624973,-4.624973,-4.624973,...,-4.624973,-4.624973,-4.624973,-4.624973,-4.624973,-4.624973,-4.624973,-4.624973,-4.624973,-4.624973
1,-4.59512,-4.59512,-4.59512,-4.59512,-4.59512,-5.288267,-4.59512,-4.59512,-4.59512,-4.59512,...,-4.59512,-4.59512,-4.59512,-4.59512,-4.59512,-4.59512,-4.59512,-4.59512,-4.59512,-4.59512


In [14]:
# prior class probability from model clf
import math

print(clf.class_log_prior_)
newlist = [math.exp(x) for x in clf.class_log_prior_]
newlist

[-0.67334455 -0.71334989]


[0.5099999999999998, 0.4899999999999997]

In [15]:
# conditional probability from model clf
newlist = [[math.exp(y) for y in x] for x in clf.feature_log_prob_]
df_featprob = pd.DataFrame(newlist, columns = vectorizer.get_feature_names_out())
df_featprob.head()

Unnamed: 0,access,add,against,apart,append,argument,arrange,authenticate,authorize,back,...,thing,thingify,things,thingy,to,true,uh,up,variable,verify
0,0.013333,0.006667,0.006667,0.006667,0.013333,0.013333,0.006667,0.013333,0.013333,0.006667,...,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.013333,0.013333
1,0.006667,0.013333,0.013333,0.013333,0.006667,0.006667,0.013333,0.006667,0.006667,0.013333,...,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.006667,0.006667


## Naive Bayes: Data Evaluation



In [16]:
X_val_transformed = vectorizer.transform(X_val_fold)

predictions = clf.predict(X_val_transformed)
probabilities = clf.predict_proba(X_val_transformed)
print('Prediksi: ',predictions)
print('Prediksi dgn peluang: ',probabilities)
f1 = f1_score(y_val_fold, predictions, average='weighted')
print('F1 Score: ', f1)

Prediksi:  ['+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '-']
Prediksi dgn peluang:  [[0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.51       0.49      ]
 [0.34228188 0.65771812]]
F1 Score:  0.3739495798319328


In [18]:
import pickle

# Menyusun dictionary dengan model dan vectorizer
saved_items = {
    'model': clf,
    'vectorizer': vectorizer
}

# Menyimpan dictionary ke dalam file menggunakan pickle
with open('sentiment_analysis.pkl', 'wb') as file:
    pickle.dump(saved_items, file)

In [17]:
import nltk
nltk.download('punkt')

uploaded = files.upload()
file_name = list(uploaded.keys())[0]
with open(file_name, "r", encoding="utf-8") as file:
    paragraph = file.read()

# Tokenize paragraph into sentences
sentences = nltk.sent_tokenize(paragraph)

# Initialize list to store predicted labels for each sentence
sentence_predictions = []

# Transform each sentence using the same CountVectorizer
for sentence in sentences:
    # Transform sentence using CountVectorizer
    sentence_transformed = vectorizer.transform([sentence])

    # Convert sparse matrix representation to dense representation
    sentence_dense = sentence_transformed.toarray()

    # Predict sentiment for the sentence
    prediction = clf.predict(sentence_dense)
    sentence_predictions.append(prediction)

# Print predicted labels for each sentence
for i, sentence in enumerate(sentences):
    print("Sentence:", sentence)
    print("Predicted Label:", sentence_predictions[i])

sentence_predictions_tuples = [tuple(pred) for pred in sentence_predictions]
sentiment_counts = Counter(sentence_predictions_tuples)
total_sentences = len(sentences)

# Print the percentage of each sentiment prediction
for sentiment, count in sentiment_counts.items():
    percentage = (count / total_sentences) * 100
    print(f"Sentiment: {sentiment}, Percentage: {percentage:.2f}%")

# Determine the major sentiment for the paragraph
major_sentiment = max(sentiment_counts, key=sentiment_counts.get)
print("Major Sentiment:", major_sentiment)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Saving codedocum_good.txt to codedocum_good.txt
Sentence: # This function iterates through a list of items and returns the last item.
Predicted Label: ['+']
Sentence: def get_last_item(items):
    """
    Retrieves the last item from a list.
Predicted Label: ['-']
Sentence: Args:
        items (list): A list of items.
Predicted Label: ['+']
Sentence: Returns:
        The last item from the list, or None if the list is empty.
    """
Predicted Label: ['-']
Sentence: result = None
    # Iterate through the list to find the last item
    for item in items:
        result = item
    return result

# This class represents a product with a name and price.
Predicted Label: ['+']
Sentence: class Product:
    """
    Represents a product with a name and price.
    """
Predicted Label: ['+']
Sentence: def __init__(self, name, price):
        """
        Initializes the product with a name and price.
Predicted Label: ['+']
Sentence: Args:
            name (str): The name of the product.
Predicted