# More practices on SVMs for text analysis

## Multi-Class classification: English, French, Malay, Spanish

In [1]:
train_data = ['Good day to you', 'Bonne journée', 'hari yang baik kepada awak','Que tengas un buen día',
              'Comment vous appelez-vous','Qu’est-ce que vous faites,','Sudah lama tidak berjumpa',
             'Happy and excited for machine learning','Encantado de conocerte']
train_labels = [ 'English', 'French', 'Malay', 'Spanish','French','French','Malay','English','Spanish']

test_data = ['hello, I am having a good day','hola estoy teniendo un buen dia','Tolong ulang sekali lagi']
test_labels = ['English','Spanish','Malay']

In [2]:
# import relevant libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import pandas as pd

# Convert strings in training and testing dataset into vector forms
vectorizer = TfidfVectorizer()
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

# fit(raw_documents[, y]): Learn a vocabulary dictionary of all tokens in the raw documents.

# fit_transform(raw_documents[, y]): Learn the vocabulary dictionary and return term-document matrix. 
#This is equivalent to fit followed by the transform, but more efficiently implemented.

# transform(raw_documents): Transform documents to document-term matrix. Extract token counts out of raw text documents 
#using the vocabulary fitted with fit or the one provided to the constructor.

# Training the model 
classifier = LinearSVC()
classifier.fit(vectorised_train_data, train_labels)
predict_labels = classifier.predict(vectorised_test_data)

# Checking model's accuracy
correct, wrong = 0,0
for i in range(len(predict_labels)):
    if predict_labels[i] == test_labels[i]:
        correct += 1
    else:
        wrong += 1
percent =  (correct/len(predict_labels)) * 100
percent = round(percent,1)
print('''The accuracy of the model's predictions is ''' + str(percent) + '%.' )

# Plotting results in table format
new_table = pd.DataFrame()
new_table ['Actual Labels'] = test_labels
new_table ['Predicted Labels'] = predict_labels
new_table

The accuracy of the model's predictions is 66.7%.


Unnamed: 0,Actual Labels,Predicted Labels
0,English,English
1,Spanish,Spanish
2,Malay,French


## Multi-label classification: Sports, Health, Technology

In [3]:
train_data = ['Expired drugs may remain effective, safe to use in a pinch',
             'Di Maria and Mbappe star as PSG win big again','England beat West Indies by six wickets in first ODI',
             'MOH requests review of S$100,000 fine on orthopaedic surgeon','Google fails to disclose microphone in Nest Secure',
              'Is AI the future of all industries?','Juventus failed to score against Altetico Madrid', 
              'Advanced neural network for image and video processing']
train_labels = [['Health'],['Sports'],['Sports'],['Health'],['Technology'],['Technology'],['Sports'],['Technology']]

test_data = ['Climate change could raise risk of congenital heart defects'
              ,'Rugby World Cup aides recovery in tsunami-damaged Kamaishi']
test_labels = [['Health'],['Sports']]

In [4]:
# import relevant libraries
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
classifier = OneVsRestClassifier(LinearSVC())

nltk.download('stopwords')
stop_words = stopwords.words('english')

# Convert strings in training and testing dataset into vector forms
vectorizer = TfidfVectorizer(stop_words=stop_words)
train_data_vectorized = vectorizer.fit_transform(train_data)
test_data_vectorized = vectorizer.transform(test_data)

# fit(raw_documents[, y]): Learn a vocabulary dictionary of all tokens in the raw documents.

# fit_transform(raw_documents[, y]): Learn the vocabulary dictionary and return term-document matrix. 
#This is equivalent to fit followed by the transform, but more efficiently implemented.

# transform(raw_documents): Transform documents to document-term matrix. Extract token counts out of raw text documents 
#using the vocabulary fitted with fit or the one provided to the constructor.

# Convert labels in training and testing dataset into binary forms
mlb = MultiLabelBinarizer()
train_labels_binary = mlb.fit_transform(train_labels)
test_labels_binary = mlb.transform(test_labels)
test_labels_list = [item for item in test_labels_binary]

# mlb.fit_transform(y) - Fit the label sets binarizer and transform the given label sets
# mlb.transform(y) - Transform the given label sets

# Model training
classifier.fit(train_data_vectorized, train_labels_binary)
pred_labels = classifier.predict(test_data_vectorized)
pred_labels_list = [item for item in pred_labels]

# Plotting results in table format
new_table = pd.DataFrame()
new_table ['Actual Labels (binary form)'] = test_labels_list
new_table ['Predicted Labels (binary form)'] = pred_labels_list
new_table

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alvin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Actual Labels (binary form),Predicted Labels (binary form)
0,"[1, 0, 0]","[0, 0, 0]"
1,"[0, 1, 0]","[0, 0, 0]"


In [5]:
# Now use the first method from the multi-class classification:)

# Training the model 
classifier = LinearSVC()
classifier.fit(train_data_vectorized, train_labels)
predict_labels = classifier.predict(test_data_vectorized)

# Checking model's accuracy
correct, wrong = 0,0
for i in range(len(predict_labels)):
    if predict_labels[i] == test_labels[i]:
        correct += 1
    else:
        wrong += 1
percent =  (correct/len(predict_labels)) * 100
percent = round(percent,1)
print('''The accuracy of the model's predictions is ''' + str(percent) + '%.' )

# Plotting results in table format
new_table = pd.DataFrame()
new_table ['Actual Labels'] = test_labels
new_table ['Predicted Labels'] = predict_labels
new_table

The accuracy of the model's predictions is 0.0%.


  y = column_or_1d(y, warn=True)


Unnamed: 0,Actual Labels,Predicted Labels
0,[Health],Sports
1,[Sports],Sports
