In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
# Load the disaster dataset from a path relative to the notebook and preview it.
# NOTE(review): the preview shows an 'Unnamed: 0' column that mirrors the row
# index - presumably an index saved with the CSV; confirm and consider
# reading it with index_col=0.
data = pd.read_csv('data/disaster.csv')
data.head()

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,just got sent this photo from ruby alaska as s...,1


In [4]:
# Class balance of the label column across the whole dataset.
data['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [5]:
# Raw text is the model input; the binary `target` column is the label.
texts = data['text']
y = data['target']

# Unigram bag-of-words counts with the vocabulary capped at 12344 terms.
# Alternative noted by the author: ngram_range=(1,2).
# NOTE(review): neither `count_vectorizer` nor `count_vector` is used by the
# cells visible here - confirm whether a later section still needs them.
count_vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_features=12344,
)
count_vector = count_vectorizer.fit_transform(texts)

# TF-IDF features with default settings, converted to a dense array.
# NOTE(review): the recorded outputs imply 7613 rows x 18074 terms, i.e.
# roughly 1.1 GB if stored as float64; this probably contributes to the long
# SVC fit times recorded below - consider keeping the matrix sparse.
td = TfidfVectorizer()
X = td.fit_transform(texts).toarray()

In [6]:
# TF-IDF weights of the first document, one row per vocabulary term,
# ordered from the highest weight down.
first_doc_weights = pd.Series(X[0], index=td.get_feature_names_out(), name="TF-IDF")
dfx = first_doc_weights.to_frame().sort_values(by='TF-IDF', ascending=False)
print(dfx.shape)

(18074, 1)


In [7]:
# Highest-weighted terms of the first document; left as the cell's last
# expression so the notebook renders the frame with its rich display.
dfx.head()

              TF-IDF
deeds       0.419764
forgive     0.419764
allah       0.362589
reason      0.327354
earthquake  0.291161


In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

In [9]:
# Class balance inside the training split.
y_train.value_counts()

0    3501
1    2589
Name: target, dtype: int64

In [10]:
# Class balance inside the held-out test split.
y_test.value_counts()

0    841
1    682
Name: target, dtype: int64

## **SVM**

In [11]:
from sklearn.svm import SVC

In [12]:
from sklearn.model_selection import GridSearchCV

In [12]:
# svm_linear_01 = SVC(kernel='linear', C=0.1)
# svm_linear_1 = SVC(kernel='linear', C=1)
# svm_linear_10 = SVC(kernel='linear', C=10)
# svm_linear_100 = SVC(kernel='linear', C=100)

## **SVM1 Linear**

### **C = 0.001**

In [13]:
%%time
# Linear-kernel SVC with C=0.001. The model is bound to `svm_linear_0001`,
# the name the evaluation cell below calls `.predict` on; `svm_linear_001`
# is the name used for the C=0.01 model.
svm_linear_0001 = SVC(kernel='linear', C=0.001).fit(X_train, y_train)

In [None]:
# Score the C=0.001 model on the held-out split: accuracy and macro-averaged F1.
svm_pred1_0001 = svm_linear_0001.predict(X_test)
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=svm_pred1_0001))
print('F1 score:', f1_score(y_true=y_test, y_pred=svm_pred1_0001, average="macro"))

In [None]:
# Confusion matrix for the C=0.001 model (rows: true class, columns: predicted class).
# The predictions are already integer class labels, so they are passed unchanged.
cm = confusion_matrix(y_test, svm_pred1_0001)
print(cm)

In [None]:
# Per-class precision, recall and F1 for the C=0.001 predictions.
print(classification_report(y_true=y_test, y_pred=svm_pred1_0001))

### **C = 0.01**

In [None]:
%%time
# Linear-kernel SVC with C=0.01, fitted on the training split.
svm_linear_001 = SVC(C=0.01, kernel='linear').fit(X=X_train, y=y_train)

In [None]:
# Score the C=0.01 model on the held-out split: accuracy and macro-averaged F1.
svm_pred1_001 = svm_linear_001.predict(X_test)
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=svm_pred1_001))
print('F1 score:', f1_score(y_true=y_test, y_pred=svm_pred1_001, average="macro"))

In [None]:
# Confusion matrix for the C=0.01 model (rows: true class, columns: predicted class).
# The predictions are already integer class labels, so they are passed unchanged.
cm = confusion_matrix(y_test, svm_pred1_001)
print(cm)

In [None]:
# Per-class precision, recall and F1 for the C=0.01 predictions.
print(classification_report(y_true=y_test, y_pred=svm_pred1_001))

### **C = 0.1**

In [13]:
%%time
# Linear-kernel SVC with C=0.1, fitted on the training split.
# The recorded run took about 33 minutes of wall time.
svm_linear_01 = SVC(C=0.1, kernel='linear').fit(X=X_train, y=y_train)

CPU times: total: 5min 4s
Wall time: 33min 2s


In [25]:
# Score the C=0.1 model on the held-out split: accuracy and macro-averaged F1.
svm_pred1_01 = svm_linear_01.predict(X_test)
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=svm_pred1_01))
print('F1 score:', f1_score(y_true=y_test, y_pred=svm_pred1_01, average="macro"))

Accuracy: 0.7202889034799738
F1 score: 0.6756503011542245


In [26]:
# Confusion matrix for the C=0.1 model (rows: true class, columns: predicted class).
# The predictions are already integer class labels, so they are passed unchanged.
# In the recorded run 416 of the 682 class-1 rows were predicted as class 0.
cm = confusion_matrix(y_test, svm_pred1_01)
print(cm)

[[831  10]
 [416 266]]


In [27]:
# Per-class precision, recall and F1 for the C=0.1 predictions.
print(classification_report(y_true=y_test, y_pred=svm_pred1_01))

              precision    recall  f1-score   support

           0       0.67      0.99      0.80       841
           1       0.96      0.39      0.56       682

    accuracy                           0.72      1523
   macro avg       0.82      0.69      0.68      1523
weighted avg       0.80      0.72      0.69      1523



### **C = 1**

In [16]:
%%time
# Linear-kernel SVC with C=1, fitted on the training split.
# The recorded run took about 25 minutes of wall time.
svm_linear_1 = SVC(C=1, kernel='linear').fit(X=X_train, y=y_train)

CPU times: total: 4min 2s
Wall time: 24min 48s


In [28]:
# Score the C=1 model on the held-out split: accuracy and macro-averaged F1.
svm_pred1_1 = svm_linear_1.predict(X_test)
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=svm_pred1_1))
print('F1 score:', f1_score(y_true=y_test, y_pred=svm_pred1_1, average="macro"))

Accuracy: 0.8030203545633617
F1 score: 0.7965796531569556


In [29]:
# Confusion matrix for the C=1 model (rows: true class, columns: predicted class).
# The predictions are already integer class labels, so they are passed unchanged.
cm = confusion_matrix(y_test, svm_pred1_1)
print(cm)

[[747  94]
 [206 476]]


In [30]:
# Per-class precision, recall and F1 for the C=1 predictions.
print(classification_report(y_true=y_test, y_pred=svm_pred1_1))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83       841
           1       0.84      0.70      0.76       682

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.80      1523
weighted avg       0.81      0.80      0.80      1523



### **C = 10**

In [19]:
%%time
# Linear-kernel SVC with C=10, fitted on the training split.
# The recorded run took about 39 minutes of wall time.
svm_linear_10 = SVC(C=10, kernel='linear').fit(X=X_train, y=y_train)

CPU times: total: 6min 50s
Wall time: 39min 15s


In [31]:
# Score the C=10 model on the held-out split: accuracy and macro-averaged F1.
svm_pred1_10 = svm_linear_10.predict(X_test)
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=svm_pred1_10))
print('F1 score:', f1_score(y_true=y_test, y_pred=svm_pred1_10, average="macro"))

Accuracy: 0.7695338148391333
F1 score: 0.7651534421519463


In [32]:
# Confusion matrix for the C=10 model (rows: true class, columns: predicted class).
# The predictions are already integer class labels, so they are passed unchanged.
cm = confusion_matrix(y_test, svm_pred1_10)
print(cm)

[[690 151]
 [200 482]]


In [33]:
# Per-class precision, recall and F1 for the C=10 predictions.
print(classification_report(y_true=y_test, y_pred=svm_pred1_10))

              precision    recall  f1-score   support

           0       0.78      0.82      0.80       841
           1       0.76      0.71      0.73       682

    accuracy                           0.77      1523
   macro avg       0.77      0.76      0.77      1523
weighted avg       0.77      0.77      0.77      1523



### **C = 100**

In [22]:
%%time
# Linear-kernel SVC with C=100, fitted on the training split.
# The recorded run took about 34 minutes of wall time.
svm_linear_100 = SVC(C=100, kernel='linear').fit(X=X_train, y=y_train)

CPU times: total: 4min 24s
Wall time: 34min 6s


In [34]:
# Score the C=100 model on the held-out split: accuracy and macro-averaged F1.
svm_pred1_100 = svm_linear_100.predict(X_test)
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=svm_pred1_100))
print('F1 score:', f1_score(y_true=y_test, y_pred=svm_pred1_100, average="macro"))

Accuracy: 0.7695338148391333
F1 score: 0.7648919200275495


In [35]:
# Confusion matrix for the C=100 model (rows: true class, columns: predicted class).
# The predictions are already integer class labels, so they are passed unchanged.
cm = confusion_matrix(y_test, svm_pred1_100)
print(cm)

[[693 148]
 [203 479]]


In [36]:
# Per-class precision, recall and F1 for the C=100 predictions.
print(classification_report(y_true=y_test, y_pred=svm_pred1_100))

              precision    recall  f1-score   support

           0       0.77      0.82      0.80       841
           1       0.76      0.70      0.73       682

    accuracy                           0.77      1523
   macro avg       0.77      0.76      0.76      1523
weighted avg       0.77      0.77      0.77      1523

