In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

In [10]:
# Configuration: which language's columns to use, and one seed shared by every
# stochastic step so that Restart & Run All reproduces the same fitted models.
language = 'Java'
RANDOM_STATE = 42

df = pd.read_csv(f'../../data/normalized/{language}.csv')

# Encode the label column as integer category codes; the categorical column is
# kept on `df` so predicted codes can be mapped back to label names later.
df[f'{language}_tiobe_label'] = df[f'{language}_tiobe_label'].astype('category')
df[f'{language}_tiobe_label_encoded'] = df[f'{language}_tiobe_label'].cat.codes

X = df[[f'{language}_wiki', f'{language}_gtrend']]
y = df[f'{language}_tiobe_label_encoded']

# Split BEFORE fitting the scaler: fitting MinMaxScaler on the full data lets the
# held-out rows influence the min/max used for preprocessing (data leakage).
X_train_raw, X_test_raw, y_train1, y_test = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE, stratify=y
)

scaler = MinMaxScaler()
X_train1 = scaler.fit_transform(X_train_raw)  # scaler statistics come from training rows only
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept under its original name for any later cell that reads it.
X_scaled = scaler.transform(X)

# Oversample the minority class on the training portion only.
smote = SMOTE(random_state=RANDOM_STATE)
X_train, y_train = smote.fit_resample(X_train1, y_train1)

# Hyperparameters are unchanged; `random_state` is added wherever the estimator
# has a stochastic component so repeated runs give identical models.
models = {
    "Random Forest": RandomForestClassifier(bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200, random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_leaf=1, min_samples_split=2, splitter='best', random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, random_state=RANDOM_STATE),
    "LightGBM": LGBMClassifier(colsample_bytree=0.8, learning_rate=0.2, max_depth=5, min_child_samples=10, n_estimators=100, subsample=0.8, random_state=RANDOM_STATE),
    "SVC": SVC(C=1, kernel='rbf', probability=True, random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(n_neighbors=9),
    "MLP": MLPClassifier(activation='relu', hidden_layer_sizes=(50, 100), max_iter=500, random_state=RANDOM_STATE),
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(C=10)
}


# Fit every model on the SMOTE-balanced, scaled training set.
for model_name, model in models.items():
    model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 34, number of negative: 34
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41
[LightGBM] [Info] Number of data points in the train set: 68, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




In [11]:
def predict_user_input(models, language, df, scaler=None):
    """Prompt for raw feature values and print every model's predicted label.

    Parameters
    ----------
    models : dict
        Maps a display name to a fitted classifier exposing ``predict``.
    language : str
        Column prefix (e.g. ``'Java'``); selects the ``{language}_wiki`` and
        ``{language}_gtrend`` features and the ``{language}_tiobe_label`` column.
    df : pandas.DataFrame
        Frame whose ``{language}_tiobe_label`` column is categorical; its
        categories are used to turn predicted integer codes back into labels.
    scaler : fitted transformer with ``transform``, optional
        When given, the user's raw values are passed through
        ``scaler.transform`` before prediction. Supply the same scaler that
        was used to prepare the training features. When ``None`` the raw
        values are handed to the models unchanged.

    Raises
    ------
    ValueError
        If either entered value cannot be parsed as a float.
    """
    feature_cols = [f'{language}_wiki', f'{language}_gtrend']

    print("Enter the following feature values (unnormalized):")
    wiki = float(input(f"Enter the {language}_wiki value: "))
    gtrend = float(input(f"Enter the {language}_gtrend value: "))

    user_input = pd.DataFrame([[wiki, gtrend]], columns=feature_cols)

    # The prompt asks for unnormalized values, so they must be mapped into the
    # scaled feature space before being shown to the models.
    features = scaler.transform(user_input) if scaler is not None else user_input

    categories = df[f'{language}_tiobe_label'].cat.categories

    # Collect all predictions first so nothing is printed if any model fails.
    predictions = {}
    for model_name, model in models.items():
        pred_encoded = model.predict(features)[0]
        predictions[model_name] = categories[pred_encoded]

    for model_name, prediction in predictions.items():
        print(f"{model_name} prediction: {prediction}")


# Pass the fitted scaler by keyword; the previous four-positional-argument call
# did not match the function's signature.
predict_user_input(models, language, df, scaler=scaler)

Enter the following feature values (unnormalized):
Random Forest prediction: high
Decision Tree prediction: high
XGBoost prediction: high
LightGBM prediction: high
SVC prediction: high
KNN prediction: high
MLP prediction: low
Naive Bayes prediction: low
Logistic Regression prediction: high
