In [123]:
# Author: @Roshan Bhandari
# Purpose: This code is used to develop ML algorithm to predict different classes of exponential fits and misfits.
# Exp_2date: Type-4 misfit
# Exp_fall: Type-2 misfit
# Exp_lin: Type-3 misfit
# Exp_nonfit: Type-1 misfit
# Exp_used: Actual interesting patterns (includes Type-5 misfit)

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import inspect

def retrieve_name(var):
    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
    return [var_name for var_name, var_val in callers_local_vars if var_val is var][0]

In [19]:
data = pd.read_csv('combined_csv_data.csv')
data = data[['Exp_2date', 'Exp_fall', 'Exp_lin', 'Exp_nonfit', 'Exp_used', 'analyzed_fraction', 'delta_y', 'exp_A', 'exp_B', 'exp_r_squared','logistic_k', 'logistic_r_s']]
data['class'] = None

def classify(row):
    if str(row['Exp_2date']).lower() == 'yes':
        row['class'] = 'Exp_2date'
    
    if str(row['Exp_lin']).lower() == 'yes':
        row['class'] = 'Exp_lin'
    
    if str(row['Exp_fall']).lower() == 'yes':
        row['class'] = 'Exp_fall'
    
    if str(row['Exp_nonfit']).lower() == 'yes':
        row['class'] = 'Exp_nonfit'
    
    if str(row['Exp_used']).lower() == 'yes':
        row['class'] = 'Exp_used'
        
    return row

data = data.apply(lambda row : classify(row), axis=1)
data.head()

data = data[['analyzed_fraction', 'delta_y', 'exp_A', 'exp_B', 'exp_r_squared','logistic_k', 'logistic_r_s', 'class']]

In [27]:
# Fill Na
data = data.fillna(0)

In [28]:
# Train Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    data[['analyzed_fraction', 'delta_y', 'exp_A', 'exp_B', 'exp_r_squared','logistic_k', 'logistic_r_s']], 
    data[['class']], 
    train_size=0.7, 
    test_size=0.3, 
    random_state=42
)


for v in [X_train, X_test, Y_train, Y_test]:
    print("Size of {}: {}".format(retrieve_name(v), v.shape))

Size of X_train: (1443, 7)
Size of X_test: (619, 7)
Size of Y_train: (1443, 1)
Size of Y_test: (619, 1)


In [30]:
models = {}

models['KNeighborsClassifier'] = KNeighborsClassifier()
models['RandomForestClassifier'] = RandomForestClassifier(n_estimators=20)
models['LogisticRegression'] = LogisticRegression()
# models['NaiveBayes'] = MultinomialNB()
models['SVC'] = SVC()


for model_name, model in models.items():
    model.fit(X_train, Y_train)
    target = model.predict(X_test)
    score = accuracy_score(Y_test,target)
    print('{}: {}'.format(model_name, score))

  model.fit(X_train, Y_train)
  model.fit(X_train, Y_train)
  y = column_or_1d(y, warn=True)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)


KNeighborsClassifier: 0.752827140549273
RandomForestClassifier: 0.9224555735056543
LogisticRegression: 0.014539579967689823
SVC: 0.5169628432956381


In [115]:
data2 = pd.read_csv('combined_csv_data2.csv')
data2 = data2.fillna(0)
data_to_predict = data2[['analyzed_fraction', 'delta_y', 'exp_A', 'exp_B', 'exp_r_squared','logistic_k', 'logistic_r_s']]
data_to_predict = data_to_predict.fillna(0)

In [116]:
data_to_predict.head(150)

Unnamed: 0,analyzed_fraction,delta_y,exp_A,exp_B,exp_r_squared,logistic_k,logistic_r_s
0,0.25,1.495200e+06,0.000000,0.000000e+00,0.000000,0.0,0.0
1,0.25,6.590000e+00,0.000000,0.000000e+00,0.000000,0.0,0.0
2,0.25,6.830000e+00,0.000000,0.000000e+00,0.000000,0.0,0.0
3,0.25,6.840000e+00,0.000000,0.000000e+00,0.000000,0.0,0.0
4,0.25,6.390000e+00,0.000000,0.000000e+00,0.000000,0.0,0.0
...,...,...,...,...,...,...,...
145,1.00,4.080000e+00,22.910032,3.041701e+12,0.874885,0.0,0.0
146,1.00,4.320000e+00,22.957231,3.027672e+12,0.869008,0.0,0.0
147,1.00,4.139999e+00,22.874145,3.045098e+12,0.867472,0.0,0.0
148,1.00,1.723000e+03,0.000000,0.000000e+00,0.000000,0.0,0.0


In [117]:
data_to_predict['exp_A'] = data_to_predict['exp_A'].astype(np.float32)
data_to_predict['exp_B'] = data_to_predict['exp_B'].astype(np.float32)
data_to_predict['analyzed_fraction'] = data_to_predict['analyzed_fraction'].astype(np.float32)
data_to_predict['exp_r_squared'] = data_to_predict['exp_r_squared'].astype(np.float32)
data_to_predict['logistic_k'] = data_to_predict['logistic_k'].astype(np.float32) 


def convert_to_float_32(data_frame):
    columns = ['analyzed_fraction', 'delta_y', 'exp_A', 'exp_B', 'exp_r_squared','logistic_k', 'logistic_r_s']
    for column_name in columns:
        float_32_list = []
        for eachdata in list(data_frame[column_name]):
            try:
                float_32_list.append(np.float32(eachdata))
            except:
                print(eachdata)
                float_32_list.append(0) 
        data_frame[column_name] = float_32_list
    return data_frame

data_to_predict = convert_to_float_32(data_to_predict)


In [118]:
def generate_data_to_classify(row):
    row['unused'] = True
    if row['exp_A'] or row['exp_B'] or row['exp_r_squared'] or row['logistic_k'] or row['logistic_r_s']:
        row['unused'] = False
    return row

data_to_predict['unused'] = None
data_to_predict = data_to_predict.apply(lambda row : generate_data_to_classify(row), axis=1)

In [85]:
data_to_predict[data_to_predict['unused']==False].head()

Unnamed: 0,analyzed_fraction,delta_y,exp_A,exp_B,exp_r_squared,logistic_k,logistic_r_s,unused
10,0.5,91.0,12.268949,77471510000.0,-0.888468,0.0,0.0,False
34,0.5,26.0,1961.389404,43171750000.0,0.999997,4.956132,0.988223,False
39,0.5,26.0,1961.389404,43171750000.0,0.999997,4.956132,0.988223,False
48,0.5,5.879999,25.826944,2451864000000.0,0.913951,0.0,0.0,False
49,0.5,5.620001,26.019491,2431053000000.0,0.94396,0.0,0.0,False


In [119]:
ml_data = data_to_predict[data_to_predict['unused']==False] 

In [120]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

if 'unused' in list(ml_data.columns):
    del ml_data['unused']

ml_data = clean_dataset(ml_data)

model = models['RandomForestClassifier']
target = model.predict(ml_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [122]:
print(target, len(target))

['Exp_nonfit' 'Exp_lin' 'Exp_lin' ... 'Exp_nonfit' 'Exp_nonfit'
 'Exp_nonfit'] 209666


In [102]:
class_count = {}
for each_class in target:
    if each_class not in class_count:
        class_count[each_class] = 1
    else:
        class_count[each_class] += 1

class_percentage = {}
for each_class in class_count:
    class_percentage[each_class] = class_count[each_class]/len(target)*100

In [103]:
print(class_count)
print(class_percentage)

{'Exp_nonfit': 168230, 'Exp_lin': 22070, 'Exp_fall': 288, 'Exp_used': 18715, 'Exp_2date': 363}
{'Exp_nonfit': 80.23713906880468, 'Exp_lin': 10.526265584310284, 'Exp_fall': 0.13736132706304313, 'Exp_used': 8.926101513836292, 'Exp_2date': 0.17313250598571062}


In [109]:
data3 = pd.read_csv('combined_csv_whole_data.csv')

In [110]:
data3 = data3.fillna(0)
data_to_predict = data3[['analyzed_fraction', 'delta_y', 'exp_A', 'exp_B', 'exp_r_squared','logistic_k', 'logistic_r_s']]

data_to_predict['unused'] = None
data_to_predict = data_to_predict.apply(lambda row : generate_data_to_classify(row), axis=1)

ml_data = data_to_predict[data_to_predict['unused']==False]

if 'unused' in list(ml_data.columns):
    del ml_data['unused']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_to_predict['unused'] = None


In [114]:
ml_data = convert_to_float_32(ml_data)
ml_data = clean_dataset(ml_data)

model = models['RandomForestClassifier']
target = model.predict(ml_data)

class_count = {}
for each_class in target:
    if each_class not in class_count:
        class_count[each_class] = 1
    else:
        class_count[each_class] += 1

class_percentage = {}
for each_class in class_count:
    class_percentage[each_class] = class_count[each_class]/len(target)*100

print(len(target))
print(class_count)
print(class_percentage)

209797
209797
209797
209797
209797
209797
209797
209797
{'Exp_nonfit': 168206, 'Exp_lin': 22173, 'Exp_used': 18770, 'Exp_2date': 361, 'Exp_fall': 287}
{'Exp_nonfit': 80.17559831646783, 'Exp_lin': 10.568787923564207, 'Exp_used': 8.946743757060396, 'Exp_2date': 0.17207109729881742, 'Exp_fall': 0.13679890560875513}
