In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [45]:
languages = ['Java', 'JavaScript', 'PHP', 'Python', 'SQL', 'C++', 'C']
d ={}
for language in languages:
    d[language] =[]

In [None]:
for language in languages:
    # print(language)
    df = pd.read_csv(f'data/normalized/language/{language}.csv')
    labels=['low','medium','high']

    df[f'{language}_wiki_label'] = df[f'{language}_wiki_label'].astype('category')
    df[f'{language}_wiki_label_encoded'] = df[f'{language}_wiki_label'].cat.codes
    X = df[[f'{language}_gtrend']]
    y = df[f'{language}_wiki_label_encoded']  

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    svc_model = SVC(random_state=42)
    svc_model.fit(X_train, y_train)
    y_pred = svc_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    svc_cv_scores = cross_val_score(svc_model, X, y, cv=5, scoring='accuracy') 
    d[language].append(svc_cv_scores.mean())

    maximum_knn_score =0
    for nilai in range(1,20):
        knn_model = KNeighborsClassifier(n_neighbors=nilai)  
        knn_model.fit(X_train, y_train)
        y_pred_knn = knn_model.predict(X_test)
        knn_accuracy = accuracy_score(y_test, y_pred_knn)
        knn_cv_scores = cross_val_score(knn_model, X, y, cv=5, scoring='accuracy')
        maximum_knn_score = max(knn_cv_scores.mean(), maximum_knn_score)
    d[language].append(maximum_knn_score)


    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=1) 
    X_pca = pca.fit_transform(X_scaled)
    X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=42)
    pca_model = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=42)
    pca_model.fit(X_train_pca, y_train_pca)
    y_pred_pca = pca_model.predict(X_test_pca)
    pca_cv_scores = cross_val_score(pca_model, X_pca, y, cv=5, scoring='accuracy')
    d[language].append(pca_cv_scores.mean())

    mlp_model = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=42)
    mlp_model.fit(X_train, y_train)
    y_pred_mlp = mlp_model.predict(X_test)
    mlp_cv_scores = cross_val_score(mlp_model, X, y, cv=5, scoring='accuracy')
    d[language].append(mlp_cv_scores.mean())

    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)
    y_pred_nb = nb_model.predict(X_test)
    nb_accuracy = accuracy_score(y_test, y_pred_nb)
    nb_cv_scores = cross_val_score(nb_model, X, y, cv=5, scoring='accuracy')
    d[language].append(nb_cv_scores.mean())

    lr_model = LogisticRegression(max_iter=1000, random_state=42)
    lr_model.fit(X_train, y_train)
    y_pred_lr = lr_model.predict(X_test)
    lr_accuracy = accuracy_score(y_test, y_pred_lr)
    lr_cv_scores = cross_val_score(lr_model, X, y, cv=5, scoring='accuracy')
    d[language].append(lr_cv_scores.mean())
    




In [47]:
for key, val in d.items():
    print(f'{key} : ', end='')
    for v in val:
        print("{:.2f}".format(v), end=',  ')
    print()
    

Java : 0.37,  0.47,  0.41,  0.41,  0.42,  0.48,  
JavaScript : 0.46,  0.50,  0.40,  0.46,  0.46,  0.49,  
PHP : 0.66,  0.67,  0.63,  0.62,  0.63,  0.61,  
Python : 0.48,  0.53,  0.51,  0.51,  0.52,  0.44,  
SQL : 0.34,  0.44,  0.39,  0.38,  0.36,  0.43,  
C++ : 0.51,  0.50,  0.42,  0.46,  0.45,  0.47,  
C : 0.38,  0.34,  0.31,  0.25,  0.23,  0.28,  
