In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
data = pd.read_csv('data/disaster.csv')
data.head()

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,just got sent this photo from ruby alaska as s...,1


In [3]:
data['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [4]:
X = data['text']
y = data['target']

count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,1), max_features=12344)
# , ngram_range=(1,2)
count_vector = count_vectorizer.fit_transform(X)
td = TfidfVectorizer()
X = td.fit_transform(X).toarray()

In [5]:
dfx = pd.DataFrame(X[0].T, index=td.get_feature_names_out(), columns=["TF-IDF"])
dfx = dfx.sort_values('TF-IDF', ascending=False)
print (dfx.shape)

(18074, 1)


In [6]:
print(dfx.head())

              TF-IDF
deeds       0.419764
forgive     0.419764
allah       0.362589
reason      0.327354
earthquake  0.291161


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=1234)

In [8]:
y_train.value_counts()

0    3501
1    2589
Name: target, dtype: int64

In [9]:
y_test.value_counts()

0    841
1    682
Name: target, dtype: int64

## **SVM**

In [10]:
from sklearn.svm import SVC

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
# svm_linear_01 = SVC(kernel='linear', C=0.1)
# svm_linear_1 = SVC(kernel='linear', C=1)
# svm_linear_10 = SVC(kernel='linear', C=10)
# svm_linear_100 = SVC(kernel='linear', C=100)

## **SVM4 Sigmoid**

### **C = 0,1**

In [13]:
%%time
svm_sigmoid_01 = SVC(kernel='sigmoid', C=0.1).fit(X_train, y_train)

CPU times: total: 4min 40s
Wall time: 28min 51s


In [14]:
svm_pred4_01 = svm_sigmoid_01.predict(X_test)
print('Accuracy:', accuracy_score(y_test, svm_pred4_01))
print('F1 score:', f1_score(y_test, svm_pred4_01, average="macro"))

Accuracy: 0.7170059093893631
F1 score: 0.6709097280747447


In [15]:
cm = confusion_matrix(y_test, svm_pred4_01.round())
print(cm)

[[831  10]
 [421 261]]


In [16]:
print(classification_report(y_test, svm_pred4_01))

              precision    recall  f1-score   support

           0       0.66      0.99      0.79       841
           1       0.96      0.38      0.55       682

    accuracy                           0.72      1523
   macro avg       0.81      0.69      0.67      1523
weighted avg       0.80      0.72      0.68      1523



### **C = 1**

In [17]:
%%time
svm_sigmoid_1 = SVC(kernel='sigmoid', C=1).fit(X_train, y_train)

CPU times: total: 3min 47s
Wall time: 22min 51s


In [18]:
svm_pred4_1 = svm_sigmoid_1.predict(X_test)
print('Accuracy:', accuracy_score(y_test, svm_pred4_1))
print('F1 score:', f1_score(y_test, svm_pred4_1, average="macro"))

Accuracy: 0.8095863427445831
F1 score: 0.8028758779023891


In [19]:
cm = confusion_matrix(y_test, svm_pred4_1.round())
print(cm)

[[757  84]
 [206 476]]


In [20]:
print(classification_report(y_test, svm_pred4_1))

              precision    recall  f1-score   support

           0       0.79      0.90      0.84       841
           1       0.85      0.70      0.77       682

    accuracy                           0.81      1523
   macro avg       0.82      0.80      0.80      1523
weighted avg       0.81      0.81      0.81      1523



### **C = 10**

In [21]:
%%time
svm_sigmoid_10 = SVC(kernel='sigmoid', C=10).fit(X_train, y_train)

CPU times: total: 3min 6s
Wall time: 19min 16s


In [22]:
svm_pred4_10 = svm_sigmoid_10.predict(X_test)
print('Accuracy:', accuracy_score(y_test, svm_pred4_10))
print('F1 score:', f1_score(y_test, svm_pred4_10, average="macro"))

Accuracy: 0.7524622455679579
F1 score: 0.7483795252450487


In [23]:
cm = confusion_matrix(y_test, svm_pred4_10.round())
print(cm)

[[670 171]
 [206 476]]


In [24]:
print(classification_report(y_test, svm_pred4_10))

              precision    recall  f1-score   support

           0       0.76      0.80      0.78       841
           1       0.74      0.70      0.72       682

    accuracy                           0.75      1523
   macro avg       0.75      0.75      0.75      1523
weighted avg       0.75      0.75      0.75      1523



### **C = 100**

In [25]:
%%time
svm_sigmoid_100 = SVC(kernel='sigmoid', C=100).fit(X_train, y_train)

CPU times: total: 3min 1s
Wall time: 17min 32s


In [26]:
svm_pred4_100 = svm_sigmoid_100.predict(X_test)
print('Accuracy:', accuracy_score(y_test, svm_pred4_100))
print('F1 score:', f1_score(y_test, svm_pred4_100, average="macro"))

Accuracy: 0.7150361129349967
F1 score: 0.713103497367946


In [27]:
cm = confusion_matrix(y_test, svm_pred4_100.round())
print(cm)

[[607 234]
 [200 482]]


In [28]:
print(classification_report(y_test, svm_pred4_100))

              precision    recall  f1-score   support

           0       0.75      0.72      0.74       841
           1       0.67      0.71      0.69       682

    accuracy                           0.72      1523
   macro avg       0.71      0.71      0.71      1523
weighted avg       0.72      0.72      0.72      1523

