### OneR algorithm

In [593]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import copy

In [594]:
X_train = pd.read_csv("X_train.csv", sep = ',')
y_train = pd.read_csv("y_train.csv", sep = ',')
X_test = pd.read_csv("X_test.csv", sep = ',')
y_test = pd.read_csv("y_test.csv", sep = ',')

merged_train_data = pd.concat([X_train, y_train], axis=1)
train_data = merged_train_data

merged_test_data = pd.concat([X_test, y_test], axis=1)
test_data = merged_test_data

In [595]:
oneR_train_data = train_data.copy()

In [596]:
def get_predictor_value(dataframe, column, predicted_variable):
    split = (dataframe[column][dataframe[predicted_variable] == 0].mean() + dataframe[column][dataframe[predicted_variable] == 1].mean())/2
    group = dataframe[dataframe[column] > split].groupby(predicted_variable).size()
    left = 0
    right = 0
    if len(group) > 1:
        if group.get(0, 0) > group.get(1, 0):
            left = 1
            right = 0
        else:
            left = 0
            right = 1

    return split, left, right

In [597]:
def oneR(train_data, column, data, predicted_variable):
    train_data.loc[train_data[column] < data[0], 'temp'] = data[1]
    train_data.loc[train_data[column] >= data[0], 'temp'] = data[2]
    accuracy = accuracy_score(train_data[predicted_variable], train_data['temp'])
    precision = precision_score(train_data[predicted_variable], train_data['temp'], zero_division=1)
    recall = recall_score(train_data[predicted_variable], train_data['temp'])
    total=(accuracy + precision + recall) / 3
    
    return [accuracy, precision, recall, total, train_data]

In [598]:
predicted_variable = 'ack'

best_accuracy = 0 
for column in oneR_train_data.columns:
    if column == predicted_variable:
        continue
    data = get_predictor_value(oneR_train_data, column, predicted_variable)
    accuracy_values = oneR(oneR_train_data, column, data, predicted_variable)

    train_data = accuracy_values[4]
    train_data.drop('temp', axis=1, inplace=True)
    

    if accuracy_values[3] > best_accuracy:
        best_column = column
        best_accuracy = accuracy_values[3]
        best_column_metrics = accuracy_values


print('best column from oneR: ' + best_column)
print('total: ' + str(best_column_metrics[3]))
print('accuracy: ' + str(best_column_metrics[0]))
print('precision: ' + str(best_column_metrics[1]))
print('recall: ' + str(best_column_metrics[2]))



best column from oneR: page_activity_duration
total: 0.900567200058147
accuracy: 0.8994723687864482
precision: 0.8568387440127727
recall: 0.9453904873752202


In [599]:
results = []
explored_columns = []

for _ in range(1):
    best_accuracy = 0 
    for column in oneR_train_data.columns:
        if column == predicted_variable or column in explored_columns:
            continue
        data = get_predictor_value(oneR_train_data, column, predicted_variable)
        accuracy_values = oneR(oneR_train_data, column, data, predicted_variable)

        if accuracy_values[3] > best_accuracy:
            best_column = column
            best_accuracy = accuracy_values[3]
            best_column_metrics = accuracy_values
            oneR_train_data = accuracy_values[4]

    print('best column from oneR: ' + best_column + 'accuracy: ' + str(best_accuracy))  
    explored_columns.append(best_column)
    result = oneR_train_data['temp'].tolist()
    results.append(result)
    oneR_train_data.drop('temp', axis=1, inplace=True)
    oneR_train_data.drop(best_column, axis=1, inplace=True)


final_values = []
for col in zip(*results):
    count_0 = col.count(0)
    count_1 = col.count(1)
    final_value = 0 if count_0 > count_1 else 1
    final_values.append(final_value)

accuracy = accuracy_score(oneR_train_data[predicted_variable], final_values)
precision = precision_score(oneR_train_data[predicted_variable], final_values, zero_division=1)
recall = recall_score(oneR_train_data[predicted_variable], final_values)

print("accuracy: " + str(accuracy))
print("precision: " + str(precision))
print("recall: " + str(recall))

best column from oneR: page_activity_durationaccuracy: 0.900567200058147
accuracy: 0.5290197167453485
precision: 0.5019347705914871
recall: 0.533176746917205


In [600]:
oneR_train_data.head(10)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,sex_F,sex_M,new_user_id,race_0,race_1,...,pct_click_product_info,scroll_move_total_rel_distance,total_load_time,screen_width,screen_height,birthdate,registration,timestamp,session_start,ack
0,0,0,1,0,0,1.0,0.0,782,0,0,...,23.91551,15.12649,7.79057,1280,1080,19650808.0,20200918.0,1570731284,1570731284,1.0
1,0,1,0,0,0,1.0,0.0,2605,0,1,...,47.54284,14.08687,4.86658,1920,1080,19390123.0,20190605.0,1558918300,1558918300,1.0
2,0,0,1,0,0,1.0,0.0,2605,0,1,...,59.59391,13.40058,2.31779,1024,768,19390123.0,20190605.0,1685228037,1685228037,1.0
3,0,1,0,0,0,1.0,0.0,2675,0,1,...,47.54284,14.08687,4.86658,1920,1080,20140911.0,20181023.0,1558918300,1558918300,1.0
4,0,0,1,0,0,1.0,0.0,2675,0,1,...,59.59391,13.40058,2.31779,1024,768,20140911.0,20181023.0,1685228037,1685228037,1.0
5,0,0,0,0,1,1.0,0.0,2204,1,0,...,39.1952,12.8379,3.03892,1920,1080,20030224.0,20190516.0,1652027250,1652027250,1.0
6,0,0,0,1,0,1.0,0.0,2204,0,0,...,32.43102,13.68771,1.67152,0,0,20030224.0,20190516.0,1669868313,1669868313,1.0
7,0,0,0,0,1,1.0,0.0,3348,0,1,...,68.01452,14.3258,5.27451,1280,1080,19320717.0,20200622.0,1631021903,1631021903,1.0
8,0,0,0,1,0,1.0,0.0,3348,0,0,...,60.97749,15.418,5.86217,1920,1024,19320717.0,20200622.0,1601176631,1601176631,0.0
9,0,0,0,0,1,1.0,0.0,3348,0,1,...,23.57572,12.9265,5.98495,1920,1080,19320717.0,20200622.0,1674979368,1674979368,0.0


In [601]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)

# Calculate precision
precision = precision_score(y_test, predictions)

# Calculate recall
recall = recall_score(y_test, predictions)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

  


Accuracy: 0.96
Precision: 0.95
Recall: 0.97


In [602]:
knn = KNeighborsClassifier(n_neighbors=100)  # You can adjust the number of neighbors

# Train the classifier
knn.fit(X_train, y_train)

# Make predictions
predictions = knn.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Accuracy: 0.51
Precision: 0.48
Recall: 0.32


  return self._fit(X, y)
