In [2]:
# Standard library
import pickle
import re

# Third-party: data handling, NLP utilities, and modelling
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split

# English stop words held in a set; used for membership tests when filtering tokens.
stop_words = set(stopwords.words("english"))

In [3]:
# Read the labelled tweets and preview the first ten rows.
df = pd.read_csv("train.csv")
print(df.head(n=10))

   id  label                                              tweet
0   1      0   @user when a father is dysfunctional and is s...
1   2      0  @user @user thanks for #lyft credit i can't us...
2   3      0                                bihday your majesty
3   4      0  #model   i love u take with u all the time in ...
4   5      0             factsguide: society now    #motivation
5   6      0  [2/2] huge fan fare and big talking before the...
6   7      0   @user camping tomorrow @user @user @user @use...
7   8      0  the next school year is the year for exams.ð...
8   9      0  we won!!! love the land!!! #allin #cavs #champ...
9  10      0   @user @user welcome here !  i'm   it's so #gr...


In [4]:
def data_processing(tweet):
    """Normalize one raw tweet into a space-separated string of content tokens.

    Steps, in order: lowercase; remove URLs; remove @mentions and the '#'
    sign (the hashtag word itself is kept); remove every character that is
    neither a word character nor whitespace; remove the stray 'ð' character;
    tokenize with NLTK and drop English stop words.

    Args:
        tweet: Raw tweet text. A value that is not a ``str`` (for example a
            missing value) is treated as an empty tweet.

    Returns:
        The remaining tokens joined by single spaces, or "" if none remain.
    """
    # Guard against non-string cells so .lower() does not raise.
    if not isinstance(tweet, str):
        return ""

    # Clean the text.
    tweet = tweet.lower()
    # Original pattern, kept for backward compatibility: because it is anchored
    # with ^...$ it only removes a URL/domain that occupies an entire line.
    tweet = re.sub(
        r"^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$",
        "",
        tweet,
        flags=re.MULTILINE,
    )
    # Also remove URLs embedded in the middle of the text, which the anchored
    # pattern above cannot match.
    tweet = re.sub(r"https?://\S+|www\.\S+", "", tweet)
    # Drop @mentions entirely; drop only the '#' of a hashtag.
    tweet = re.sub(r"\@\w+|\#", "", tweet)
    # Drop punctuation and other non-word, non-whitespace characters.
    tweet = re.sub(r"[^\w\s]", "", tweet)
    # 'ð' counts as a word character, so the previous step leaves it in place.
    tweet = re.sub(r'ð', '', tweet)

    # Tokenize and filter out stop words.
    tweet_tokens = word_tokenize(tweet)
    filtered_tweet = [w for w in tweet_tokens if w not in stop_words]
    return " ".join(filtered_tweet)

In [5]:
# Call data_processing on every tweet and store the cleaned text back in the column.
df["tweet"] = df["tweet"].apply(data_processing)

In [6]:
# Keep only the first occurrence of each cleaned tweet text, then show the frame.
df = df.drop_duplicates(subset="tweet")
print(df)

          id  label                                              tweet
0          1      0  father dysfunctional selfish drags kids dysfun...
1          2      0  thanks lyft credit cant use cause dont offer w...
2          3      0                                     bihday majesty
3          4      0                        model love u take u time ur
4          5      0                      factsguide society motivation
...      ...    ...                                                ...
31956  31957      0     fishing tomorrow carnt wait first time 2 years
31957  31958      0                                    ate isz youuuâï
31958  31959      0  see nina turner airwaves trying wrap mantle ge...
31959  31960      0    listening sad songs monday morning otw work sad
31961  31962      0                                       thank follow

[28424 rows x 3 columns]


In [7]:
# One shared lemmatizer instance, reused for every call to lemmatizing().
lemmatizer = WordNetLemmatizer()


def lemmatizing(data):
    """Lemmatize every token of a text and return them joined by spaces.

    Args:
        data: Text to process; it is split with NLTK's word_tokenize.

    Returns:
        A single string of the lemmatized tokens separated by one space.
    """
    lemmas = map(lemmatizer.lemmatize, word_tokenize(data))
    return " ".join(lemmas)

In [8]:
# Replace each cleaned tweet with its lemmatized form.
df["tweet"] = df["tweet"].apply(lemmatizing)

In [9]:
# Inspect the tweet column after lemmatization.
print(df.tweet)

0        father dysfunctional selfish drag kid dysfunct...
1        thanks lyft credit cant use cause dont offer w...
2                                           bihday majesty
3                              model love u take u time ur
4                            factsguide society motivation
                               ...                        
31956        fishing tomorrow carnt wait first time 2 year
31957                                      ate isz youuuâï
31958    see nina turner airwave trying wrap mantle gen...
31959       listening sad song monday morning otw work sad
31961                                         thank follow
Name: tweet, Length: 28424, dtype: object


In [10]:
# TF-IDF over unigrams, bigrams and trigrams of the processed tweets.
# NOTE(review): the vectorizer is fitted on every row of df, and the train/test
# split happens in a later cell, so the vocabulary and IDF weights also reflect
# the rows that end up in the test split. Consider fitting on the training
# split only.
vect = TfidfVectorizer(ngram_range=(1, 3))
vect.fit(df["tweet"])

In [11]:
# Report the vocabulary size and a sample of the learned n-gram features.
feature_names = vect.get_feature_names_out()
print(
    "Number of features: {}\n".format(len(feature_names)),
    "First 20 features: \n{}".format(feature_names[:20]),
    sep="\n",
)

Number of features: 357592

First 20 features: 
['0000001' '0000001 polluting' '0000001 polluting niger' '00027'
 '00027 photooftheday' '00027 photooftheday music' '001' '0035' '00h30'
 '01' '01 4995' '01 4995 rustic' '01 7900' '01 7900 shopalyssas' '01 blog'
 '01 blog silver' '01 croatia' '01 croatia happy' '01 may' '01 may actual']


In [12]:
# Texts and labels, plus the TF-IDF matrix produced by the fitted vectorizer.
X, Y = df["tweet"], df["label"]
V = vect.transform(X)

# Persist the fitted vectorizer to disk with pickle.
Tfidf_Save = "Tfidf.pkl"
with open(Tfidf_Save, "wb") as file:
    pickle.dump(vect, file)

In [23]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    V,
    Y,
    test_size=0.3,
    random_state=42,
)

In [24]:
# Shapes of the feature matrices and label vectors for both splits.
print("Size of x_train:", x_train.shape)
print("Size of y_train:", y_train.shape)
print("Size of x_test: ", x_test.shape)
print("Size of y_test: ", y_test.shape)

Size of x_train: (19896, 357592)
Size of y_train: (19896,)
Size of x_test:  (8528, 357592)
Size of y_test:  (8528,)


In [34]:
# File that receives the pickled classifier.
Pkl_Filename = "SVM_Model.pkl"

# Fit a linear-kernel support vector classifier on the training split.
SVM = svm.SVC(kernel="linear").fit(x_train, y_train)

# Predict labels for the held-out split.
y_pred = SVM.predict(x_test)

# Persist the fitted classifier to disk with pickle.
with open(Pkl_Filename, "wb") as file:
    pickle.dump(SVM, file)

print()




In [32]:
# Confusion matrix of the test-split predictions (rows: true label, columns: predicted label).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[7937,    8],
       [ 454,  129]], dtype=int64)

In [33]:
# Per-class precision, recall and F1 for the test-split predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      7945
           1       0.94      0.22      0.36       583

    accuracy                           0.95      8528
   macro avg       0.94      0.61      0.67      8528
weighted avg       0.95      0.95      0.93      8528

