In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
df = pd.read_csv('AggregateDataLargeLabelled.csv').dropna()

In [None]:
relevent_data = df.drop('OpenAI', axis=1).drop('Azure', axis=1).drop('Caption', axis=1)
relevent_data['Label'] = relevent_data['Label'].apply(lambda x: 0 if x == 1 else 1)
relevent_data.head()

Unnamed: 0,Label,OpenAI_Conf,OpenAI_Azure_Conf,Azure_Conf,Animal,Building,Indoor,Outdoor,People
0,1,0.2961,0.00021,0.3386,0.0,1.0,0.0,0.0,0.0
1,1,0.2954,2.7e-05,0.338996,0.0,1.0,0.0,0.0,0.0
2,0,0.1941,0.000589,0.995034,0.0,0.0,0.0,1.0,0.0
3,1,0.3076,0.11676,0.211877,0.0,1.0,0.0,0.0,0.0
5,0,0.2053,0.007248,0.500465,0.0,1.0,0.0,0.0,0.0


In [None]:
print(len(relevent_data.loc[(relevent_data['Label'] == 1) & (relevent_data['Animal'] == 1)])/len(relevent_data.loc[relevent_data['Label'] == 1]))

0.012121212121212121


In [None]:
"""
Stats about our dataset
"""

# 0s vs 1s
print("0s vs 1s")
print(relevent_data['Label'].value_counts())
print("\n")

# Average confidence of OpenAI when 0
print("Average confidence of OpenAI when 0")
print(sum(relevent_data.loc[relevent_data['Label'] == 0]['OpenAI_Conf'])/len(relevent_data.loc[relevent_data['Label'] == 0]))
print("\n")

# Average confidence of Azure when 1
print("Average confidence of Azure when 1")
print(sum(relevent_data.loc[relevent_data['Label'] == 1]['Azure_Conf'])/len(relevent_data.loc[relevent_data['Label'] == 0]))
print("\n")

# Percentage of 1 when animal is when 1
print("Percentage of 1 when animal is when 1")
print(len(relevent_data.loc[(relevent_data['Label'] == 1) & (relevent_data['Animal'] == 1)])/len(relevent_data.loc[(relevent_data['Label'] == 1)]))
print("\n")

# Percentage of 1 when building is when 1
print("Percentage of 1 when building is when 1")
print(len(relevent_data.loc[(relevent_data['Label'] == 1) & (relevent_data['Building'] == 1)])/len(relevent_data.loc[(relevent_data['Label'] == 1)]))
print("\n")

# Percentage of 1 when indoor is when 1
print("Percentage of 1 when indoor is when 1")
print(len(relevent_data.loc[(relevent_data['Label'] == 1) & (relevent_data['Indoor'] == 1)])/len(relevent_data.loc[(relevent_data['Label'] == 1)]))
print("\n")

# Percentage of 1 when outdoor is when 1
print("Percentage of 1 when outdoor is when 1")
print(len(relevent_data.loc[(relevent_data['Label'] == 1) & (relevent_data['Outdoor'] == 1)])/len(relevent_data.loc[(relevent_data['Label'] == 1)]))
print("\n")

# Percentage of 1 when people is when 1
print("Percentage of 1 when people is when 1")
print(len(relevent_data.loc[(relevent_data['Label'] == 1) & (relevent_data['People'] == 1)])/len(relevent_data.loc[(relevent_data['Label'] == 1)]))
print("\n")


0s vs 1s
0    336
1    165
Name: Label, dtype: int64


Average confidence of OpenAI when 0
0.4896425595238093


Average confidence of Azure when 1
0.21398327290922617


Percentage of 1 when animal is when 1
0.012121212121212121


Percentage of 1 when building is when 1
0.12727272727272726


Percentage of 1 when indoor is when 1
0.0


Percentage of 1 when outdoor is when 1
0.4121212121212121


Percentage of 1 when people is when 1
0.18181818181818182




In [None]:
relevent_data = relevent_data.to_numpy()

In [None]:
y = relevent_data[:, 0]
X = relevent_data[:, 1:]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

base_estimator = RandomForestClassifier(random_state=0)
parameters = {
  'n_estimators':[10,20,30,50,100,150,200,250,500,1000], 
  'max_depth':[1, 10],
  'max_depth': [3, 5, 10, 25, 50],
  'min_samples_split': [2, 5, 10, 25, 50]
}
search = GridSearchCV(base_estimator, parameters, cv=5)
search.fit(X, y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [None]:
model = search.best_estimator_

In [None]:
model.score(X_test, y_test)

0.7058823529411765

In [None]:
zeros = [1 if d == 0 else 0 for d in y_test]

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred

array([1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
X_test

array([[2.55900000e-01, 1.32700000e-01, 4.02824193e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [3.06400000e-01, 3.06400000e-01, 5.22666872e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [7.45000000e-01, 2.40000000e-05, 3.30100030e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [5.79000000e-01, 3.13400000e-03, 3.75661731e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.87400000e-01, 6.13000000e-03, 2.12117642e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [7.18800000e-01, 1.82800000e-02, 4.64414895e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.62700000e-01, 2.51000000e-04, 5.38805127e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.

In [None]:
d = {'OpenAI': [], 'Azure': [], 'Caption': [], 'Label':[], 'Pred':[]}
pref_df = pd.DataFrame(data=d)

for row, y in zip(X_test,y_pred):
  dataframe_row = df.loc[(df['OpenAI_Conf'] == row[0]) & (df['OpenAI_Azure_Conf'] == row[1]) & (df['Azure_Conf'] == row[2])]
  df2 = pd.DataFrame(
      [[dataframe_row['OpenAI'], dataframe_row['Azure'], dataframe_row['Caption'], dataframe_row['Label'], y]], 
      columns=['OpenAI', 'Azure', 'Caption', 'Label', 'Pred']
  )
  pref_df = pref_df.append(df2)

In [None]:
pref_df.head()

Unnamed: 0,OpenAI,Azure,Caption,Label,Pred
0,173 [84] Its approval by Congress became a ...,"173 a man speaking to a man Name: Azure, dt...","173 Following the 1966 smog, air-pollution...","173 1.0 Name: Label, dtype: float64",1.0
0,274 Kalki Koechlin smiling for the camera N...,274 Kalki Koechlin smiling for the camera N...,"274 Koechlin's performance was acclaimed, ...","274 2.0 Name: Label, dtype: float64",1.0
0,492 [77] It is estimated that a third of al...,"492 a snake in the dirt Name: Azure, dtype:...","492 Polyporus squamosus Name: Caption, dtyp...","492 1.0 Name: Label, dtype: float64",0.0
0,72 Mill Creek continues east through the Me...,72 a river with trees and grass around it N...,"72 Mill Creek near Junction Name: Caption, ...","72 1.0 Name: Label, dtype: float64",0.0
0,453 Khenut and Nebet were buried in a doubl...,"453 a large stone structure Name: Azure, dt...",453 Remains of the outer casing on Unas's p...,"453 1.0 Name: Label, dtype: float64",0.0


In [None]:
df.to_csv('predictions.csv')

In [None]:
model.predict([[0.3577, 0.01926, 0.432734876871109, 0,0,0,0,1]])

array([1.])

In [None]:
pickle.dump(model, open('finalized_model.sav', 'wb'))