# Classification of UN Speaches

In [1]:
import pandas as pd
import re

import nltk
nltk.download('stopwords')
nltk.download("wordnet")
nltk.download('omw-1.4')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix

from pycaret.classification import *

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Step 1 - Load the Dataset

In [2]:
df = pd.read_csv("data/source/un-general-debates-blueprint.csv.gz")
print(df.shape)
df.sample(5)

(7507, 7)


Unnamed: 0,session,year,country,country_name,speaker,position,text
998,33,1978,DOM,Dominican Republic,Jimenez,,﻿\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
6469,65,2010,MWI,Malawi,Bingu Wa Mutharika,President,I am delighted to address \nthe sixty-fifth se...
5397,60,2005,ALB,Albania,Besnik Mustafaj,Minister for Foregn Affairs,"Let\nme first convey, on behalf of the people ..."
2023,40,1985,DOM,Dominican Republic,Mr. Vega Imbert,,This session of the General Assembly is of a v...
5144,58,2003,OMN,Oman,Yousef Bin Al-Alawi Bin Abdulla,Minister for Foreign Affairs,"﻿Allow\nme, Sir, on behalf of the Sultanate of..."


In [3]:
### Only pick the five countries in the UN Security Cpuncil

SECURITY_COUNCIL = ["USA", "FRA", "GBR", "RUS", "CHN"]

df_council = df[df["country"].isin(SECURITY_COUNCIL)][["country", "text"]].reset_index(drop=True)
print(df_council.shape)
df_council.head()

(228, 2)


Unnamed: 0,country,text
0,FRA,"84.\t Within one month, when we celebrate the..."
1,GBR,"110.\t Mr. President, I should like first to s..."
2,USA,1.\t It is my privilege to extend to you once ...
3,CHN,"1.\t Mr. President, it is my pleasant duty, o..."
4,FRA,34.\tIt is toward Asia where all the problems ...


In [4]:
### Check for missing values

df_council.isna().sum()

country    0
text       0
dtype: int64

## Step 2 - Normalize the Text

In [5]:
def normalize(text, stemm="porter", lemm="wordnet", stopwords="nltk"):

    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    list_words = text.split()

    ## remove Stopwords
    if stopwords == "nltk":
        list_stopwords = nltk.corpus.stopwords.words("english")
        list_words = [word for word in list_words if word not in list_stopwords]
    elif stopwords == "spacy":
        pass
    elif stopwords == "gensim":
        pass
    else:
        pass
                
    ## Stemming (remove -ing, -ly, ...)
    if stemm == "porter":
        ps = nltk.stem.porter.PorterStemmer()
        list_words = [ps.stem(word) for word in list_words]
    elif stemm == "snowball":
        ps = nltk.stem.snowball.SnowballStemmer("english")
        list_words = [ps.stem(word) for word in list_words]
    else:
        pass
                
    ## Lemmatisation (convert the word into root word)
    if lemm == "wordnet":
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        list_words = [lem.lemmatize(word) for word in list_words]
    else:
        pass

    return " ".join(list_words)

In [6]:
df_council["normalized"] = df_council["text"].apply(normalize, args=["snowball", "wordnet", "nltk"])

df_council.head(5)

Unnamed: 0,country,text,normalized
0,FRA,"84.\t Within one month, when we celebrate the...",84 within one month celebr twentyfifth anniver...
1,GBR,"110.\t Mr. President, I should like first to s...",110 mr presid like first say glad sit presid a...
2,USA,1.\t It is my privilege to extend to you once ...,1 privileg extend warm congratul unit state de...
3,CHN,"1.\t Mr. President, it is my pleasant duty, o...",1 mr presid pleasant duti behalf govern deleg ...
4,FRA,34.\tIt is toward Asia where all the problems ...,34 toward asia problem world ferment whose anc...


## Step 3 - Split the Data into Training and Test 

In [7]:
X = df_council["normalized"]
y = df_council["country"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(182,) (182,)
(46,) (46,)


## Step 4 - Vectorize the Training and Test Data

In [8]:
tfidf = TfidfVectorizer()

X_train_vectorized = tfidf.fit_transform(X_train)    # Fit with the train data
X_test_vectorized = tfidf.transform(X_test)          # used the same vectorize to transform the test data

X_names = tfidf.get_feature_names()

df_X_train_vectorized =  pd.DataFrame(X_train_vectorized.toarray(), columns=X_names, index=X_train.index)

print(df_X_train_vectorized.shape)

df_X_test_vectorized =  pd.DataFrame(X_test_vectorized.toarray(), columns=X_names, index=X_test.index)

print(df_X_test_vectorized.shape)

(182, 10681)
(46, 10681)


## Step 5 - Dimensionality Reduction using Chi-squared

In [2]:
ch2 = SelectKBest(chi2, k=200)
X_train_chi2 = ch2.fit_transform(X_train_vectorized, y_train)
X_test_chi2 = ch2.transform(X_test_vectorized)

# feature_names = X_names[ch2.get_support()]

df_X_train_chi2 =  pd.DataFrame(X_train_chi2.toarray(),  index=X_train.index)

print(df_X_train_chi2.shape)

df_X_test_chi2 =  pd.DataFrame(X_test_chi2.toarray(),  index=X_test.index)

print(df_X_test_chi2.shape)

NameError: name 'SelectKBest' is not defined

In [10]:
df_train_chi2 = pd.concat([y_train, df_X_train_chi2], axis=1)

df_test_chi2 = pd.concat([y_test, df_X_test_chi2], axis=1)



In [11]:
df_train_chi2.head()

Unnamed: 0,country,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
52,USA,0.0,0.0,0.014876,0.008564,0.029409,0.016679,0.058248,0.009984,0.033135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014494,0.0,0.013062,0.0,0.0,0.0,0.0,0.042967,0.03851,0.0,0.008865,0.013684,0.010559,0.0,0.026529,0.0,0.012772,0.019968,0.015008,0.0,0.0,0.0,0.041122,0.024971,0.0,0.0,0.025256,0.009744,0.0,0.0,0.0,0.0,0.0,0.009074,0.0,0.0,0.0,0.041122,0.0,0.0,0.0,0.0,0.0,0.00728,0.0,0.0,0.0,0.0,0.0,0.0,0.042093,0.013684,0.019726,0.0,0.012679,0.161338,0.0,0.184261,0.0,0.0,0.097284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012495,0.0,0.068512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008183,0.012059,0.0,0.0,0.0,0.058082,0.019606,0.0,0.3145,0.0,0.028332,0.0,0.072826,0.0,0.024811,0.014166,0.0,0.0,0.0,0.0,0.0,0.0,0.224643,0.0,0.0,0.016184,0.0,0.0,0.053268,0.027152,0.0,0.0,0.0,0.034061,0.0,0.008277,0.0,0.0,0.0,0.037215,0.0,0.0,0.0,0.0,0.0,0.0,0.012586,0.0,0.0,0.0,0.024139,0.0,0.0,0.0,0.0,0.082893,0.0,0.0,0.0,0.0,0.144325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013684,0.0,0.0,0.0,0.0,0.0,0.050512,0.0,0.237479,0.014134,0.0,0.0,0.0,0.016854,0.0,0.0,0.0,0.087035,0.026124,0.0,0.040897,0.0,0.056103
10,GBR,0.0,0.0,0.0,0.02045,0.011704,0.0,0.013909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020333,0.0,0.0,0.0,0.097549,0.042538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042302,0.015138,0.021026,0.0,0.02565,0.091958,0.0,0.0,0.0,0.0,0.037523,0.063349,0.0,0.0,0.0,0.017919,0.0,0.022918,0.0,0.032732,0.019876,0.0,0.0,0.180924,0.09307,0.0,0.0,0.0,0.015363,0.010284,0.0,0.0,0.0,0.0,0.05728,0.0,0.020123,0.0,0.0,0.0,0.017385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016337,0.0,0.0,0.0,0.069346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014812,0.029312,0.043192,0.018762,0.0,0.0,0.100167,0.023409,0.0,0.122611,0.0,0.016913,0.0,0.0,0.0,0.0,0.067653,0.0,0.0,0.0,0.0,0.0,0.0,0.045979,0.0,0.0,0.038645,0.0,0.018942,0.072684,0.0,0.0,0.0,0.0,0.040666,0.0,0.0,0.0,0.0,0.0,0.008886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06774,0.0,0.019214,0.0,0.017159,0.0,0.0,0.061856,0.0,0.0,0.0,0.0,0.054828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032675,0.0,0.0,0.0,0.0,0.0,0.040205,0.0,0.099621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015595,0.0,0.024415,0.0,0.0
95,GBR,0.0,0.0,0.0,0.007661,0.017538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020499,0.021129,0.0,0.0,0.015234,0.0,0.0,0.020499,0.190022,0.03187,0.0,0.0,0.0,0.011341,0.080615,0.012965,0.0,0.046737,0.021129,0.0,0.047259,0.0,0.019217,0.155017,0.0,0.0,0.0,0.009445,0.0,0.059327,0.009752,0.011425,0.0,0.0,0.0,0.0,0.0,0.042915,0.007446,0.0,0.0,0.082837,0.017432,0.0,0.023036,0.0,0.0,0.015409,0.064932,0.0,0.0,0.0,0.061307,0.0,0.0,0.0,0.0,0.0,0.032562,0.328255,0.0,0.0,0.0,0.0,0.0,0.030122,0.024481,0.008822,0.0,0.0,0.127002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013795,0.012965,0.328255,0.0,0.0,0.033532,0.0,0.015321,0.03187,0.068681,0.008212,0.0,0.0,0.0,0.011097,0.0,0.010787,0.0,0.0,0.0,0.121229,0.0,0.044299,0.137792,0.03081,0.038015,0.015238,0.014477,0.0,0.011097,0.019007,0.022431,0.0,0.0,0.0,0.0,0.014617,0.08612,0.0,0.023712,0.014477,0.0,0.014192,0.027228,0.0,0.0,0.0,0.0,0.007617,0.012856,0.022211,0.01151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025376,0.0,0.007198,0.0,0.0,0.0,0.009327,0.046343,0.0,0.0,0.0,0.0,0.041078,0.0,0.0,0.009505,0.0,0.012965,0.0,0.0,0.0,0.0,0.0,0.024481,0.0,0.0,0.0,0.0,0.0,0.015061,0.0,0.137792,0.0,0.0,0.0,0.044299,0.030153,0.0,0.015935,0.0,0.035388,0.0,0.0,0.012195,0.0,0.0
90,GBR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010394,0.0,0.0,0.0,0.0,0.0,0.023858,0.036886,0.0,0.0,0.0,0.0,0.0,0.0,0.136098,0.074185,0.0,0.0,0.0,0.0132,0.0,0.0,0.0,0.0,0.036886,0.0,0.0,0.0,0.0671,0.100232,0.0,0.009229,0.0,0.021986,0.0,0.04143,0.0,0.026595,0.0,0.0,0.0,0.0,0.0,0.028541,0.025997,0.0,0.0,0.026294,0.010144,0.0,0.026811,0.0,0.0,0.017934,0.018893,0.0,0.0,0.0,0.035677,0.0,0.0,0.0,0.0,0.0,0.060637,0.050939,0.0,0.0,0.0,0.0,0.0,0.052588,0.0,0.010268,0.0,0.0,0.201563,0.0,0.05481,0.076394,0.0,0.0,0.0,0.0,0.016056,0.0,0.050939,0.014023,0.014023,0.013009,0.0,0.008916,0.037093,0.019984,0.0,0.0,0.0,0.01436,0.0,0.0,0.025108,0.0,0.0,0.0,0.114219,0.010206,0.085929,0.086868,0.0,0.029496,0.0,0.0,0.0,0.012915,0.022122,0.0,0.0,0.0,0.0,0.0,0.0,0.060139,0.0,0.0,0.033697,0.0,0.0,0.063379,0.0,0.0,0.008967,0.0,0.0,0.0,0.034467,0.0,0.0,0.0,0.023246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.059068,0.0,0.008377,0.0,0.0,0.0,0.0,0.021575,0.0,0.014246,0.0,0.0,0.00683,0.0,0.0,0.022126,0.0,0.01509,0.0,0.0,0.0,0.0,0.0,0.014246,0.0,0.0,0.016206,0.0,0.0,0.008765,0.0,0.09355,0.0,0.0,0.0,0.085929,0.017547,0.0,0.0,0.0,0.057662,0.013599,0.0,0.070964,0.0,0.038939
26,RUS,0.0,0.0,0.0,0.043454,0.043524,0.0,0.022167,0.044327,0.021016,0.0,0.0,0.0,0.0,0.0,0.007491,0.0,0.0,0.027004,0.0,0.0,0.014535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007491,0.024125,0.0,0.022598,0.063589,0.138411,0.0,0.050604,0.0,0.006697,0.109634,0.113579,0.0,0.0,0.069657,0.0,0.0,0.0,0.0,0.030429,0.036955,0.0,0.0,0.058736,0.018541,0.0,0.0,0.0,0.0,0.021852,0.02302,0.0,0.0,0.0,0.017388,0.009605,0.0,0.0,0.0,0.0,0.0,0.0,0.085524,0.0,0.0,0.0,0.0,0.064075,0.0,0.0,0.0,0.024125,0.12689,0.062067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017087,0.02563,0.0,0.0,0.005432,0.0,0.0,0.005823,0.0,0.0,0.0,0.0,0.020762,0.0,0.0,0.0,0.0,0.028653,0.0,0.04188,0.097702,0.0,0.026955,0.0,0.071852,0.0,0.0,0.022462,0.0,0.0,0.0,0.023141,0.0,0.0,0.146553,0.0,0.0,0.092382,0.0,0.0,0.091703,0.060274,0.0,0.010926,0.0,0.037806,0.018231,0.073492,0.016323,0.0,0.0,0.014162,0.010805,0.0,0.027347,0.0,0.0,0.0,0.0,0.014535,0.0,0.0,0.056139,0.061567,0.018231,0.0,0.026453,0.262878,0.030325,0.026037,0.038076,0.015905,0.249653,0.0,0.035566,0.094358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049366,0.0,0.0,0.192226,0.0,0.101773,0.0,0.077306,0.029555,0.04188,0.01069,0.0,0.0,0.0,0.110405,0.0,0.0,0.116728,0.0,0.0


In [12]:
df_test_chi2.head()

Unnamed: 0,country,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
160,GBR,0.0,0.0,0.024494,0.014101,0.032283,0.0,0.019182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043015,0.0,0.0,0.0,0.0,0.011792,0.06341,0.0,0.0,0.0,0.0,0.0,0.032762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014182,0.01494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01624,0.0,0.0,0.159395,0.0,0.0,0.342333,0.0,0.0,0.0,0.0,0.0,0.071598,0.0,0.0,0.0,0.0,0.0,0.042304,0.0,0.0,0.015117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08501,0.0,0.0,0.243073,0.0,0.058313,0.0,0.026648,0.0,0.0,0.046651,0.0,0.0,0.0,0.0,0.0,0.0,0.06341,0.0,0.0,0.0,0.0,0.0,0.01253,0.0,0.0,0.0,0.0,0.0,0.023665,0.0,0.0,0.0,0.0,0.012255,0.0,0.0,0.0,0.0,0.0,0.087296,0.020725,0.0,0.01557,0.0,0.013249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129624,0.0,0.0,0.0,0.0,0.071598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130282,0.0,0.0,0.033671,0.0,0.0
178,CHN,0.0,0.0,0.0,0.0,0.012744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016483,0.351482,0.056529,0.0,0.0,0.0,0.0,0.0,0.0,0.121028,0.158538,0.0,0.0,0.0,0.013727,0.0,0.327647,0.056694,0.0,0.0,0.0,0.0,0.0,0.0,0.03564,0.064925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011197,0.0,0.0,0.0,0.0,0.01782,0.0,0.109557,0.0,0.0,0.0,0.0,0.0,0.0,0.035576,0.0,0.0,0.0,0.010945,0.0,0.0,0.0,0.0,0.125848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017511,0.070044,0.0,0.0,0.0,0.0,0.0,0.035805,0.0,0.0,0.017932,0.0,0.0,0.0,0.020429,0.0,0.0,0.0,0.089211,0.0,0.14185,0.0,0.018416,0.0,0.126236,0.0,0.032255,0.073665,0.0,0.0,0.0,0.0,0.0,0.0,0.208603,0.0,0.0,0.01052,0.0,0.0,0.0,0.0,0.0,0.134369,0.0,0.01107,0.0,0.032279,0.050185,0.0,0.0,0.067733,0.0,0.0,0.0,0.0,0.0,0.0,0.065451,0.0,0.049173,0.01917,0.0,0.0,0.0,0.0,0.013555,0.0,0.0,0.0,0.0,0.0,0.051171,0.0,0.018225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028172,0.017789,0.0,0.0,0.020237,0.0,0.0,0.0,0.0,0.133506,0.03675,0.0,0.015145,0.0,0.0,0.0,0.0,0.0,0.020572,0.0,0.065199,0.008861,0.0,0.0
130,GBR,0.0,0.0,0.016752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0133,0.024404,0.0,0.019179,0.0,0.0,0.0,0.110413,0.020062,0.0,0.0,0.0,0.0,0.025373,0.016323,0.0,0.0,0.0133,0.0,0.0,0.0,0.008065,0.173474,0.0,0.0,0.0,0.0,0.0,0.351045,0.0,0.0,0.0,0.016901,0.0,0.0,0.0,0.038592,0.0,0.0,0.0,0.0,0.03292,0.0,0.0,0.0,0.0,0.096997,0.040873,0.0,0.0,0.0,0.038592,0.0,0.0,0.0,0.0,0.0,0.065591,0.275505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014278,0.065409,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.097937,0.275505,0.0,0.0,0.0,0.0,0.009644,0.0,0.0,0.010339,0.0,0.0,0.0,0.0,0.0,0.0,0.017696,0.0,0.0,0.101748,0.02208,0.0,0.101193,0.0,0.063812,0.0,0.009113,0.0,0.0,0.007976,0.028239,0.0,0.0,0.0,0.0,0.018402,0.021684,0.0,0.0,0.018225,0.0,0.017866,0.00857,0.0,0.014278,0.019399,0.0,0.0,0.0,0.018641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011742,0.0,0.0,0.0,0.0,0.0,0.036939,0.0,0.0,0.0,0.0,0.0,0.0,0.050617,0.0,0.019394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028442,0.0,0.15179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00891,0.02942,0.0,0.015352,0.0,0.0
151,RUS,0.0,0.0,0.0,0.064146,0.024476,0.0,0.014543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132367,0.0,0.014743,0.0,0.021984,0.0,0.062579,0.03205,0.03337,0.011067,0.0,0.0,0.0,0.099355,0.0,0.0,0.087245,0.0,0.0,0.0,0.0,0.025668,0.020782,0.0,0.0,0.021019,0.0,0.0,0.0,0.0,0.0,0.0,0.022654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009089,0.0,0.028055,0.0,0.0,0.0,0.0,0.01051,0.0,0.0,0.106377,0.0,0.257804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015599,0.0,0.0,0.0,0.0,0.011461,0.0,0.0,0.086096,0.0,0.010216,0.0,0.019617,0.0,0.0,0.064451,0.0,0.0,0.216336,0.0,0.035368,0.0,0.090914,0.0,0.0,0.061895,0.0,0.0,0.0,0.068319,0.0,0.0,0.056087,0.0,0.0,0.0,0.0,0.0,0.037998,0.0,0.0,0.032257,0.034162,0.0,0.0,0.0,0.0,0.066716,0.0,0.018583,0.0,0.0,0.0,0.253317,0.068319,0.0,0.015712,0.0,0.0,0.0,0.02009,0.0,0.017941,0.0,0.013016,0.0,0.0,0.051246,0.018735,0.0,0.065516,0.0,0.0175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027052,0.0,0.0,0.0,0.019433,0.0,0.0,0.0,0.0,0.240374,0.0,0.0,0.014543,0.0,0.0,0.035397,0.0,0.0,0.059264,0.0,0.0,0.017018,0.0,0.0
80,GBR,0.0,0.0,0.011423,0.013152,0.015055,0.0,0.0,0.030665,0.008481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019471,0.0173,0.0,0.0,0.03009,0.0,0.0,0.0,0.0,0.076984,0.19221,0.0,0.013614,0.0,0.032432,0.0,0.066206,0.0,0.0,0.022999,0.011524,0.0,0.0,0.0,0.052627,0.012783,0.0,0.0,0.077573,0.104749,0.0,0.0,0.0,0.0,0.006614,0.006967,0.0,0.0,0.0,0.068415,0.0,0.0,0.0,0.0,0.0,0.050314,0.0,0.0,0.0,0.0,0.0,0.0,0.019393,0.010507,0.007573,0.0,0.0,0.059466,0.0,0.020213,0.018782,0.0,0.0,0.0,0.0,0.023684,0.022259,0.0,0.0,0.0,0.01919,0.0,0.026304,0.041037,0.0,0.021148,0.0,0.0,0.021183,0.0,0.006284,0.055556,0.024133,0.0,0.0,0.05451,0.007528,0.0,0.103498,0.0,0.04351,0.0,0.006213,0.0,0.0,0.021755,0.0,0.0,0.0,0.014008,0.0,0.025095,0.06407,0.0,0.0,0.024854,0.0,0.012182,0.093491,0.0,0.0,0.019841,0.0,0.0,0.0,0.031776,0.009881,0.0,0.0,0.01143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050827,0.011323,0.012357,0.0,0.0,0.0,0.016013,0.071607,0.0,0.010507,0.0,0.0,0.080598,0.0,0.0,0.00816,0.012807,0.01113,0.0,0.0,0.0,0.0,0.0,0.021014,0.0,0.325677,0.011953,0.0,0.0,0.025858,0.0,0.093641,0.010853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012151,0.04012,0.0,0.015702,0.0,0.0


## Step 6 - Classification using sklearn LinearSVC

In [13]:
lsvc = LinearSVC(verbose=0)
print(lsvc)

LinearSVC()


In [14]:
lsvc.fit(X_train_chi2, y_train)
score = lsvc.score(X_train_chi2, y_train)
print("Score: ", score)

Score:  0.9945054945054945


In [15]:
ypred = lsvc.predict(X_test_chi2)

cm = confusion_matrix(y_test, ypred)
print(cm)

[[ 7  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0  9  0  1]
 [ 0  0  0 12  0]
 [ 0  0  0  0 10]]


## Step 7 - Classification using PyCaret

In [16]:
experiment = setup(data=df_train_chi2, target="country", test_data=df_test_chi2, 
    silent=True, fold=5, session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,country
2,Target Type,Multiclass
3,Label Encoded,"CHN: 0, FRA: 1, GBR: 2, RUS: 3, USA: 4"
4,Original Data,"(182, 201)"
5,Missing Values,False
6,Numeric Features,199
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [21]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9836,0.9991,0.9836,0.9849,0.9836,0.9795,0.9799,0.138
ridge,Ridge Classifier,0.9673,0.0,0.9686,0.9716,0.9669,0.9591,0.9605,0.008
rf,Random Forest Classifier,0.967,0.9989,0.9664,0.9722,0.9667,0.9587,0.9601,0.166
lr,Logistic Regression,0.9616,0.9927,0.9621,0.9652,0.9609,0.9519,0.9531,0.018
lightgbm,Light Gradient Boosting Machine,0.9616,0.999,0.9621,0.9649,0.9611,0.9519,0.9529,0.11
gbc,Gradient Boosting Classifier,0.9279,0.988,0.9279,0.9378,0.93,0.9099,0.9116,0.7
knn,K Neighbors Classifier,0.9228,0.9893,0.9229,0.9302,0.9226,0.9034,0.9055,0.016
svm,SVM - Linear Kernel,0.9227,0.0,0.9207,0.9409,0.9191,0.903,0.9087,0.014
dt,Decision Tree Classifier,0.8619,0.9141,0.8598,0.8796,0.8579,0.8271,0.8322,0.01
lda,Linear Discriminant Analysis,0.8462,0.9444,0.8457,0.8644,0.845,0.8076,0.812,0.014


In [22]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [23]:
predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,194,195,196,197,198,199,0_0.0,country,Label,Score
0,0.0,0.024494,0.014101,0.032283,0.0,0.019182,0.0,0.0,0.0,0.0,...,0.130282,0.0,0.0,0.033671,0.0,0.0,1.0,GBR,GBR,0.49
1,0.0,0.0,0.0,0.012744,0.0,0.0,0.0,0.0,0.0,0.0,...,0.020572,0.0,0.065199,0.008861,0.0,0.0,1.0,CHN,CHN,0.83
2,0.0,0.016752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00891,0.02942,0.0,0.015352,0.0,0.0,1.0,GBR,GBR,0.72
3,0.0,0.0,0.064146,0.024476,0.0,0.014543,0.0,0.0,0.0,0.0,...,0.059264,0.0,0.0,0.017018,0.0,0.0,1.0,RUS,RUS,0.75
4,0.0,0.011423,0.013152,0.015055,0.0,0.0,0.030665,0.008481,0.0,0.0,...,0.012151,0.04012,0.0,0.015702,0.0,0.0,1.0,GBR,GBR,0.47
5,0.033253,0.0,0.021486,0.012297,0.0,0.0,0.0,0.013855,0.0,0.0,...,0.059552,0.0,0.0,0.017101,0.0,0.0,1.0,RUS,RUS,0.76
6,0.0,0.019977,0.023002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.017541,0.0,0.045769,0.0,0.0,1.0,GBR,GBR,0.65
7,0.0,0.0,0.043109,0.008224,0.0,0.009774,0.016752,0.018533,0.0,0.0,...,0.026552,0.021917,0.0,0.08006,0.0,0.0,1.0,FRA,FRA,0.68
8,0.0,0.023913,0.0,0.0,0.0,0.009363,0.168518,0.044388,0.0,0.0,...,0.063595,0.010499,0.0,0.021914,0.0,0.0,1.0,USA,USA,0.72
9,0.0,0.0,0.0,0.019128,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0931,0.0,0.0,1.0,GBR,GBR,0.48
