In [1]:
import pandas as pd, numpy as np, ast, re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

In [2]:
def parse_np_array(array_string):
    pattern = r'''# Match (mandatory) whitespace between...
              (?<=\]) # ] and
              \s+
              (?= \[) # [, or
              |
              (?<=[^\[\]\s]) 
              \s+
              (?= [^\[\]\s]) # two non-bracket non-whitespace characters
           '''
    fixed_string = re.sub(pattern, ',', array_string, flags=re.VERBOSE)
    return np.array(ast.literal_eval(fixed_string))

In [3]:
df = pd.read_csv("labeled_bow.csv", index_col="ID")
df["bow_word2vec"] = df["bow_word2vec"].apply(lambda x: parse_np_array(x) if type(x) == str and "[" in x else None)
df["bow_clust2vec"] = df["bow_clust2vec"].apply(lambda x: parse_np_array(x) if type(x) == str and "[" in x else None)
df.head(5)

Unnamed: 0_level_0,Score,bow_word2vec,bow_clust2vec,language
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,9,"[0.0227469543, 0.0409026681, 0.0287264666, 0.0...","[0.04031772, 0.0288417, 0.00614185, 0.06809973...",en
1,10,"[0.0410866831, 0.0234945428, 0.0323280634, 0.0...","[0.03501698, 0.02333498, 0.01251062, 0.0569246...",en
2,8,"[0.05490769, 0.03248798, 0.0296512, 0.07768815...","[0.04587762, 0.0282501, 0.00382094, 0.06212483...",en
3,4,"[0.0286348588, 0.0208428292, 0.0139906317, 0.1...","[0.04349127, 0.02652042, -0.00698592, 0.065983...",en
4,7,"[0.015773149, 0.0392650257, 0.0291523327, 0.09...","[0.0409446889, 0.0242504175, 0.012562747, 0.07...",en


In [4]:
y = np.array(df["Score"])
X_w2v = np.array(list(df["bow_word2vec"]))
X_c2v = np.array(list(df["bow_clust2vec"]))

In [5]:
ss = StandardScaler()
X_w2v = ss.fit_transform(X_w2v)
X_c2v = ss.fit_transform(X_c2v)

In [6]:
wX_train, wX_test, y_train, y_test = train_test_split(X_w2v, y, test_size=0.30, random_state=42)
cX_train, cX_test, y_train, y_test = train_test_split(X_c2v, y, test_size=0.30, random_state=42)
wX_train.shape, wX_test.shape

((700, 300), (300, 300))

In [7]:
def eval_model(model, X_train, X_test, y_train, y_test, fit=True):
    if fit:
        model.fit(X_train, y_train)
    predicted_y = model.predict(X_test)
    return 1 - np.sum(np.abs(y_test - predicted_y))/(len(y_test)*10)

In [8]:
sgd_clf = SGDClassifier(random_state=42, max_iter=200, alpha=0.01, )
rf_clf = RandomForestClassifier(random_state=42, n_estimators=5)
svc_clf = LinearSVC(random_state=42, multi_class="crammer_singer")
# lr_clf = LogisticRegression(random_state=42, C=0.5, max_iter=200, multi_class="multinomial")
mlp_clf = MLPClassifier(random_state=42, max_iter=500)

In [9]:
for model in ["sgd_clf", "rf_clf", "svc_clf", "mlp_clf"]:
    print(model)
    print("BOW_word2vec: ",eval_model(globals()[model], wX_train, wX_test, y_train, y_test))
    print("BOW_clust2vec: ",eval_model(globals()[model], cX_train, cX_test, y_train, y_test))

sgd_clf
BOW_word2vec:  0.821333333333
BOW_clust2vec:  0.804333333333
rf_clf
BOW_word2vec:  0.772
BOW_clust2vec:  0.770666666667
svc_clf
BOW_word2vec:  0.780666666667
BOW_clust2vec:  0.762333333333
mlp_clf
BOW_word2vec:  0.811666666667
BOW_clust2vec:  0.810333333333


# Let's refactor the code into a single class

In [10]:
class CLassification:
    def __init__(self, df_path, only_tr=False):
        self.df = self.load_df(df_path, only_tr)
        
        self.X_w2v, self.X_c2v, self.y = self.load_Xy()
        
        self.wX_train, self.wX_test, self.y_train, self.y_test = train_test_split(\
            X_w2v, y, test_size=0.30, random_state=42)
        
        self.cX_train, self.cX_test, self.y_train, self.y_test = train_test_split(\
            X_c2v, y, test_size=0.30, random_state=42)
        
        
    def load_df(self, df_path, only_tr):
        df = pd.read_csv(df_path, index_col="ID")
        if only_tr:
            df = df[df["language"] == "tr"]
        df["bow_word2vec"] = df["bow_word2vec"].apply(lambda x: parse_np_array(x) if type(x) == str and "[" in x else None)
        df["bow_clust2vec"] = df["bow_clust2vec"].apply(lambda x: parse_np_array(x) if type(x) == str and "[" in x else None)
        return df

    def load_Xy(self):
        y = np.array(self.df["Score"])
        X_w2v = self.scale_X(np.array(list(self.df["bow_word2vec"])))
        X_c2v = self.scale_X(np.array(list(self.df["bow_clust2vec"])))
        return X_w2v, X_c2v, y
    
    def scale_X(self, X):
        ss = StandardScaler()
        X = ss.fit_transform(X)
        
    def eval_model(self, model, X_train, X_test, y_train, y_test, fit=True):
        if fit:
            model.fit(X_train, y_train)
        predicted_y = model.predict(X_test)
        return 1 - np.sum(np.abs(y_test - predicted_y))/(len(y_test)*10) 
    
    def eval_data(self):
        sgd_clf = SGDClassifier(random_state=42, max_iter=200, alpha=0.01, )
        rf_clf = RandomForestClassifier(random_state=42, n_estimators=5)
        svc_clf = LinearSVC(random_state=42, multi_class="crammer_singer")
        # lr_clf = LogisticRegression(random_state=42, C=0.5, max_iter=200, multi_class="multinomial")
        mlp_clf = MLPClassifier(random_state=42, max_iter=500)
        for model in ["sgd_clf", "rf_clf", "svc_clf", "mlp_clf"]:
            print(model)
            print("BOW_word2vec: ",eval_model(locals()[model], self.wX_train, self.wX_test, self.y_train, self.y_test))
            print("BOW_clust2vec: ",eval_model(locals()[model], self.cX_train, self.cX_test, self.y_train, self.y_test))

# Now let's try Pseudo-Cross-Lingual

In [11]:
cl = CLassification("labeled_bow_pcl.csv")  ## pcl: pseudo-cross-lingual
cl.eval_data()

sgd_clf
BOW_word2vec:  0.821333333333
BOW_clust2vec:  0.804333333333
rf_clf
BOW_word2vec:  0.772
BOW_clust2vec:  0.770666666667
svc_clf
BOW_word2vec:  0.780666666667
BOW_clust2vec:  0.762333333333
mlp_clf
BOW_word2vec:  0.811666666667
BOW_clust2vec:  0.810333333333


# Training and testing only on Turkish reviews

In [12]:
cl = CLassification("labeled_bow_pcl.csv", only_tr=True)  ## pcl: pseudo-cross-lingual
cl.eval_data()

sgd_clf
BOW_word2vec:  0.821333333333
BOW_clust2vec:  0.804333333333
rf_clf
BOW_word2vec:  0.772
BOW_clust2vec:  0.770666666667
svc_clf
BOW_word2vec:  0.780666666667
BOW_clust2vec:  0.762333333333
mlp_clf
BOW_word2vec:  0.811666666667
BOW_clust2vec:  0.810333333333


In [13]:
cl = CLassification("labeled_bow.csv", only_tr=True)  ## pcl: pseudo-cross-lingual
cl.eval_data()

sgd_clf
BOW_word2vec:  0.821333333333
BOW_clust2vec:  0.804333333333
rf_clf
BOW_word2vec:  0.772
BOW_clust2vec:  0.770666666667
svc_clf
BOW_word2vec:  0.780666666667
BOW_clust2vec:  0.762333333333
mlp_clf
BOW_word2vec:  0.811666666667
BOW_clust2vec:  0.810333333333
