In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris # load_data
# import utils

In [2]:
# setting the seed to control the randomness
np.random.seed(40)

In [3]:
try :
    # get data
    iris = load_iris() # checking the data
    X = iris.data
    y = iris.target
except Exception as e :
    print("Unable to download training & test CSV. Error: %s", e)

In [4]:
def get_data(path) :
    """
    Helper to extract data from a csv file
    
    Args: 
        path to the csv file
    
    Returns:
        pd.DataFrame
    """
    return pd.read_csv(path, sep=",", header='infer')


def write_data(path, df) : 
    """
    Helper which writes a dataframe to csv
    
    Args: 
        Path: path to the csv file
        pd.DataFrame: dataframe to save as a csv file
    
    Returns:
        CSV file
    """
    return df.to_csv(path, sep=",", index=False, header=True)


def numpy_array_to_pandas_df(array) :
    """
    An helper function which transform np.ndarray into pd.dataframe
    
    Args: 
        np.ndarray: any numpy.ndarray
    
    Returns:
        pd.DataFrame: the transformed array into a dataframe
    """
    return pd.DataFrame(array, columns = ['sepal_l','sepal_w','petal_l', 'petal_w'])


def numpy_array_to_pandas_df_(array) :
    """
    An helper function which transform np.ndarray into pd.dataframe
    
    Args: 
        np.ndarray: any numpy.ndarray
    
    Returns:
        pd.DataFrame: the transformed array into a dataframe
    """
    return pd.DataFrame(array, columns = ['target'])


def drop_column_with_nan(df) :
    """
    An helper function which drop columns with a too large amount of nan values
    
    Args: 
        pd.DataFrame: any dataset
    
    Returns:
        pd.DataFrame: the same dataset stripped of these columns
    """

    mask = df.isnull().any(axis=0) # a columns list with missing data
    columns_with_nan  = df.columns[mask]
    for column in columns_with_nan :
        if df[column].isnull().sum() / df.shape[0] > 0.60:
            df.drop(column, 1, inplace=True)

    return df

In [5]:
def data_prep(X, y) :
    """
    Function to preprocess data
    
    Args: 
        pd.DataFrame of train raw and test raw
    
    Returns:
        pd.DataFrame of train and test pre processed
    """  

    # Splitting the data into training and test sets. (0.80, 0.20) split.
    # random_state is a random number generator, to get the same splits, run after run
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=14)

    y_train = y_train.reshape(y_train.shape[0], 1)
    y_test = y_test.reshape(y_test.shape[0], 1)

    print('Train set:', X_train.shape, y_train.shape)
    print('Test set:', X_test.shape, y_test.shape)


    ############################ TRANSFORM NP.NDARRAY INTO PD.DATAFRAME #########################

    X_train = numpy_array_to_pandas_df(X_train)
    y_train = numpy_array_to_pandas_df_(y_train)
    X_test = numpy_array_to_pandas_df(X_test)
    y_test = numpy_array_to_pandas_df_(y_test)


    ############################ DROP COLUMNS CONTAINING TOO MUCH NAN VALUES #########################

    X_train = drop_column_with_nan(X_train)
    y_train = drop_column_with_nan(y_train)
    X_test = drop_column_with_nan(X_test)
    y_test = drop_column_with_nan(y_test)


    ################## RECORDING OF THE NEW DATAFRAMES IN A HARD COPY ##################

    X_TRAIN_PATH = r'..\\data\\X_train.csv'
    write_data(X_TRAIN_PATH, X_train)

    Y_TRAIN_PATH = r'..\\data\\y_train.csv'
    write_data(Y_TRAIN_PATH, y_train)

    X_TEST_PATH = r'..\\data\\X_test.csv'
    write_data(X_TEST_PATH, X_test)

    Y_TEST_PATH = r'..\\data\\y_test.csv'
    write_data(Y_TEST_PATH, y_test)


    return X_train, y_train, X_test, y_test

In [6]:
# if __name__ == "__main__" :
X_train, y_train, X_test, y_test = data_prep(X, y)

Train set: (120, 4) (120, 1)
Test set: (30, 4) (30, 1)


In [7]:
type(X_train)

pandas.core.frame.DataFrame

In [27]:
# X_train = X_train.to_numpy()

In [28]:
# type(X_train)

In [29]:
# import os
# # Path
# path = "C:\\Users\\arthu\\Efrei\\M2\\APPLICATIONS_OF_BIG_DATA_2\\projet\\data\\"
# # Path of Start directory
# start = "C:\\Users\\arthu\\Efrei\\M2\\APPLICATIONS_OF_BIG_DATA_2\\projet\\"
# # Compute the relative file path
# # to the given path from the 
# # the given start directory.
# relative_path = os.path.relpath(path, start)
# # Print the relative file path
# # to the given path from the 
# # the given start directory.
# print(relative_path)

In [8]:
from pathlib import Path
relative_path = Path("C:\\Users\\arthu\\Efrei\\M2\\APPLICATIONS_OF_BIG_DATA_2\\projet\\data\\").resolve()
str(relative_path)

'C:\\Users\\arthu\\Efrei\\M2\\APPLICATIONS_OF_BIG_DATA_2\\projet\\data'

In [11]:
X_TRAIN_PATH = str(relative_path) + r'\\X_train.csv'
write_data(X_TRAIN_PATH, X_train)

In [10]:
print(X_TRAIN_PATH)

C:\Users\arthu\Efrei\M2\APPLICATIONS_OF_BIG_DATA_2\projet\data\\X_train.csv


In [12]:
Y_TRAIN_PATH = r'..\\data\\y_train.csv'

In [13]:
X_train = get_data(X_TRAIN_PATH)
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [14]:
X_train = get_data(X_TRAIN_PATH).to_numpy()
print(type(X_train))

<class 'numpy.ndarray'>


In [15]:
X_TEST_PATH = r'../data/X_test.csv'

In [16]:
X_test_ = get_data(X_TEST_PATH)
print(type(X_test_))

<class 'pandas.core.frame.DataFrame'>


In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, accuracy_score
import joblib
import os
# import files.utils as utils


def select_best_n_neighbors(X_train, X_test, y_train, y_test, model, n_neighbors) :
    # Instantiating and fitting the model
    model = KNeighborsClassifier(n_neighbors)
    model.fit(X_train, y_train)
    # get predictions based on X_test set
    y_pred = model.predict(X_test)

    # get metrics
    best_f1score = f1_score(y_test, y_pred, average='macro')
    best_acc = accuracy_score(y_test, y_pred) # best_acc = model.score(X_test, y_test)

    # Find the optimistic solutions on K-values
    k_range = range(1, 30)
    scores_list = []

    for k in k_range :
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        f1score = f1_score(y_test, y_pred, average='macro')
        accuracy = accuracy_score(y_test, y_pred) # accuracy = model.score(X_test, y_test)
        scores_list.append(accuracy)

        # Finding the best k-value best on best accuracy value
        if best_acc < accuracy :
            n_neighbors = k
            best_f1score = f1score
            best_acc = accuracy
            best_model = model
    max_ = max(scores_list) # best_acc

    return n_neighbors, best_f1score, best_acc, best_model, scores_list


def train_model(X_TRAIN_PATH, Y_TRAIN_PATH, X_TEST_PATH, Y_TEST_PATH) :

    try :
        # get data
        X_train = get_data(X_TRAIN_PATH).to_numpy() # from pd.DataFrame to np.ndarray
        y_train = get_data(Y_TRAIN_PATH).to_numpy()
        X_test = get_data(X_TEST_PATH).to_numpy()
        y_test = get_data(Y_TEST_PATH).to_numpy()
    except Exception as e :
        print("Unable to download training & test CSV. Error: %s", e)

    # imputation
    imputer = SimpleImputer(strategy='median')
    imputer.fit(X_train) 
    X_train = imputer.transform(X_train)
    X_test = imputer.transform(X_test)
    # saving fitted imputer
    if not os.path.exists("../files/data") :
        os.makedirs("../files/data")
    joblib.dump(imputer, "../files/data/imputer.save") 

    # scaling
    scaler = StandardScaler()
    scaler.fit(X_test)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # saving fitted scaler
    joblib.dump(scaler, "../files/data/std_scaler.save")

    model = KNeighborsClassifier(n_neighbors=1)
    n_neighbors, best_f1score, best_acc, best_model, scores_list = select_best_n_neighbors(X_train, X_test, y_train, y_test, model, n_neighbors=1)

    print('f1_score: ', best_f1score, 'accuracy_score: ', best_acc)

    joblib.dump(best_model, "../files/data/best_model.save") 

    return n_neighbors, best_f1score, best_acc, best_model, scores_list

In [18]:
X_TRAIN_PATH = r'..\\data\\X_train.csv'
Y_TRAIN_PATH = r'..\\data\\y_train.csv'
X_TEST_PATH = r'..\\data\\X_test.csv'
Y_TEST_PATH = r'..\\data\\y_test.csv'

# type(train_model(X_TRAIN_PATH, Y_TRAIN_PATH, X_TEST_PATH, Y_TEST_PATH)) # tuple

print("model : ", train_model(X_TRAIN_PATH, Y_TRAIN_PATH, X_TEST_PATH, Y_TEST_PATH)[3])

f1_score:  0.9568151147098516 accuracy_score:  0.9666666666666667
model :  KNeighborsClassifier(n_neighbors=2)


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [19]:
def predict_(data: pd.DataFrame) :
    """
    Helper to make predictions on data
    
    Args: 
        pd.DataFrame of given features to perform prediction on
    
    Returns:
        np.array with model predictions
    """

    # Load model as a PyFuncModel
    loaded_model = joblib.load('../files/data/best_model.save')

    # load necessary trained artefacts
    imputer = joblib.load('../files/data/imputer.save') 
    scaler = joblib.load('../files/data/std_scaler.save') 
    data = imputer.transform(data)
    data = scaler.transform(data)

    return loaded_model.predict(data)


def predict_model(X_TEST_PATH) :

    Y_TEST_PATH = r'../data/y_test.csv'

    try :
        # get data
        X_test_ = get_data(X_TEST_PATH)
        X_test = get_data(X_TEST_PATH).to_numpy()
        y_test = get_data(Y_TEST_PATH).to_numpy()
    except Exception as e :
        print("Unable to download training & test CSV. Error: %s", e)

    y_pred = predict_(X_test) # y_pred = loaded_model.predict(X_test)
    y_pred = y_pred.reshape(y_pred.shape[0], 1)

    # Create the dataframe from numpy.ndarray
    X_test_df = pd.DataFrame(X_test, columns=list(X_test_.columns))
    y_test_df = pd.DataFrame(y_test, columns=['target'])
    y_pred_df = pd.DataFrame(y_pred, columns=['target_pred'])

    # Add y_pred column to the X_test dataset
    test_df = X_test_df.join(y_test_df).join(y_pred_df)

    PATH = r'../data/final_test_df.csv'
    write_data(PATH, test_df)

    return y_pred

In [20]:
print(X.shape)
print(X[:, 1])
print(type(X[:, 1]))

(150, 4)
[3.5 3.  3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 3.7 3.4 3.  3.  4.  4.4 3.9 3.5
 3.8 3.8 3.4 3.7 3.6 3.3 3.4 3.  3.4 3.5 3.4 3.2 3.1 3.4 4.1 4.2 3.1 3.2
 3.5 3.6 3.  3.4 3.5 2.3 3.2 3.5 3.8 3.  3.8 3.2 3.7 3.3 3.2 3.2 3.1 2.3
 2.8 2.8 3.3 2.4 2.9 2.7 2.  3.  2.2 2.9 2.9 3.1 3.  2.7 2.2 2.5 3.2 2.8
 2.5 2.8 2.9 3.  2.8 3.  2.9 2.6 2.4 2.4 2.7 2.7 3.  3.4 3.1 2.3 3.  2.5
 2.6 3.  2.6 2.3 2.7 3.  2.9 2.9 2.5 2.8 3.3 2.7 3.  2.9 3.  3.  2.5 2.9
 2.5 3.6 3.2 2.7 3.  2.5 2.8 3.2 3.  3.8 2.6 2.2 3.2 2.8 2.8 2.7 3.3 3.2
 2.8 3.  2.8 3.  2.8 3.8 2.8 2.8 2.6 3.  3.4 3.1 3.  3.1 3.1 3.1 2.7 3.2
 3.3 3.  2.5 3.  3.4 3. ]
<class 'numpy.ndarray'>


In [23]:
X_TEST_PATH = r'..\\data\\X_test.csv'
Y_TEST_PATH = r'..\\data\\y_test.csv'

model = train_model(X_TRAIN_PATH, Y_TRAIN_PATH, X_TEST_PATH, Y_TEST_PATH)[3] # best_model

prediction = predict_model(X_TEST_PATH)

print(type(prediction))
print(prediction.shape)

f1_score:  0.9568151147098516 accuracy_score:  0.9666666666666667
<class 'numpy.ndarray'>
(30, 1)


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [176]:
np.argmax(prediction, axis=1)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [177]:
iris_type = {
    0: 'setosa',
    1: 'versicolor',
    2: 'virginica'
}

In [181]:
print(iris_type[np.argmax(prediction, axis=1)[0]])
print(round(max(prediction[0]), 2))

setosa
0


In [69]:
from sklearn.linear_model import LogisticRegression
model_ = LogisticRegression(solver='lbfgs',
                                  max_iter=1000,
                                  multi_class='multinomial').fit(X, y)

# X = [iris.sepal_l, iris.sepal_w, iris.petal_l, iris.petal_w]

# prediction_ = model_.predict_proba([X])

In [70]:
predictions = model_.predict_proba(X)

In [75]:
print(type(predictions))
print(predictions.shape)

<class 'numpy.ndarray'>
(150, 3)


In [82]:
predictions[0]

array([9.81586270e-01, 1.84137160e-02, 1.44952498e-08])

In [100]:
# k = 0
# for i in range(0, predictions.shape[0]) :
#     print(round(max(predictions[i]), 2))
#     k += 1 
# print(k)

In [90]:
predictions

array([[9.81586270e-01, 1.84137160e-02, 1.44952498e-08],
       [9.71342443e-01, 2.86575273e-02, 3.01759334e-08],
       [9.85276921e-01, 1.47230670e-02, 1.23344424e-08],
       [9.76068908e-01, 2.39310522e-02, 3.96885764e-08],
       [9.85235946e-01, 1.47640416e-02, 1.20022712e-08],
       [9.70222909e-01, 2.97770170e-02, 7.39945370e-08],
       [9.86773895e-01, 1.32260855e-02, 1.99811002e-08],
       [9.76152545e-01, 2.38474272e-02, 2.77242433e-08],
       [9.79630763e-01, 2.03692060e-02, 3.05914361e-08],
       [9.68775128e-01, 3.12248407e-02, 3.17113754e-08],
       [9.76235893e-01, 2.37640872e-02, 1.93666164e-08],
       [9.75215057e-01, 2.47848994e-02, 4.39453721e-08],
       [9.74237245e-01, 2.57627331e-02, 2.15052010e-08],
       [9.91874058e-01, 8.12593805e-03, 3.88890537e-09],
       [9.88016551e-01, 1.19834461e-02, 2.84238572e-09],
       [9.86651171e-01, 1.33488165e-02, 1.29388806e-08],
       [9.87965448e-01, 1.20345423e-02, 9.26304311e-09],
       [9.81336767e-01, 1.86632

In [116]:
# lst = list()
# for i in range(0, predictions.shape[0]) :
#     lst.append(round(max(predictions[i]), 2))

# # converting list to array
# arr = np.array(lst)

# print(len(arr))
# print(type(arr))
# print(arr)

150
<class 'numpy.ndarray'>
[0.98 0.97 0.99 0.98 0.99 0.97 0.99 0.98 0.98 0.97 0.98 0.98 0.97 0.99
 0.99 0.99 0.99 0.98 0.96 0.98 0.95 0.98 1.   0.95 0.95 0.95 0.97 0.97
 0.98 0.97 0.96 0.96 0.99 0.99 0.97 0.98 0.98 0.99 0.99 0.97 0.99 0.96
 0.99 0.97 0.96 0.97 0.98 0.98 0.98 0.98 0.87 0.86 0.73 0.94 0.82 0.86
 0.72 0.85 0.9  0.91 0.94 0.9  0.98 0.78 0.92 0.93 0.77 0.97 0.8  0.96
 0.56 0.96 0.6  0.86 0.94 0.92 0.8  0.52 0.81 0.93 0.96 0.96 0.96 0.65
 0.75 0.79 0.81 0.91 0.93 0.94 0.9  0.83 0.96 0.88 0.92 0.94 0.93 0.94
 0.76 0.94 1.   0.84 0.97 0.92 0.98 1.   0.51 0.98 0.95 0.99 0.79 0.86
 0.93 0.85 0.96 0.95 0.88 1.   1.   0.55 0.98 0.81 1.   0.61 0.96 0.95
 0.54 0.61 0.96 0.86 0.97 0.98 0.97 0.52 0.81 0.99 0.98 0.88 0.56 0.91
 0.98 0.88 0.84 0.99 0.99 0.92 0.75 0.84 0.96 0.76]


In [117]:
arr = np.array([])
for i in range(0, predictions.shape[0]) :
    # Add/append an element at the end of a numpy array
    arr = np.append(arr, round(max(predictions[i]), 2))

print(len(arr))
print(type(arr))
print(arr)

150
<class 'numpy.ndarray'>
[0.98 0.97 0.99 0.98 0.99 0.97 0.99 0.98 0.98 0.97 0.98 0.98 0.97 0.99
 0.99 0.99 0.99 0.98 0.96 0.98 0.95 0.98 1.   0.95 0.95 0.95 0.97 0.97
 0.98 0.97 0.96 0.96 0.99 0.99 0.97 0.98 0.98 0.99 0.99 0.97 0.99 0.96
 0.99 0.97 0.96 0.97 0.98 0.98 0.98 0.98 0.87 0.86 0.73 0.94 0.82 0.86
 0.72 0.85 0.9  0.91 0.94 0.9  0.98 0.78 0.92 0.93 0.77 0.97 0.8  0.96
 0.56 0.96 0.6  0.86 0.94 0.92 0.8  0.52 0.81 0.93 0.96 0.96 0.96 0.65
 0.75 0.79 0.81 0.91 0.93 0.94 0.9  0.83 0.96 0.88 0.92 0.94 0.93 0.94
 0.76 0.94 1.   0.84 0.97 0.92 0.98 1.   0.51 0.98 0.95 0.99 0.79 0.86
 0.93 0.85 0.96 0.95 0.88 1.   1.   0.55 0.98 0.81 1.   0.61 0.96 0.95
 0.54 0.61 0.96 0.86 0.97 0.98 0.97 0.52 0.81 0.99 0.98 0.88 0.56 0.91
 0.98 0.88 0.84 0.99 0.99 0.92 0.75 0.84 0.96 0.76]


In [143]:
iris_type = {
    0: 'setosa',
    1: 'versicolor',
    2: 'virginica'
}

In [145]:
print(len(np.argmax(predictions, axis=1)))
print(type(np.argmax(predictions, axis=1)))
print(np.argmax(predictions, axis=1))

150
<class 'numpy.ndarray'>
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [135]:
print(iris_type[np.argmax(predictions, axis=1)[149]])
print(arr[149])

virginica
0.76


In [161]:
lst = list()

for i in range(0, predictions.shape[0]) :
    dic = {}
    dic['class'] = iris_type[np.argmax(predictions, axis=1)[i]]
    dic['probability'] = round(max(predictions[i]), 2)
    lst.append(dic)

lst

[{'class': 'setosa', 'probability': 0.98},
 {'class': 'setosa', 'probability': 0.97},
 {'class': 'setosa', 'probability': 0.99},
 {'class': 'setosa', 'probability': 0.98},
 {'class': 'setosa', 'probability': 0.99},
 {'class': 'setosa', 'probability': 0.97},
 {'class': 'setosa', 'probability': 0.99},
 {'class': 'setosa', 'probability': 0.98},
 {'class': 'setosa', 'probability': 0.98},
 {'class': 'setosa', 'probability': 0.97},
 {'class': 'setosa', 'probability': 0.98},
 {'class': 'setosa', 'probability': 0.98},
 {'class': 'setosa', 'probability': 0.97},
 {'class': 'setosa', 'probability': 0.99},
 {'class': 'setosa', 'probability': 0.99},
 {'class': 'setosa', 'probability': 0.99},
 {'class': 'setosa', 'probability': 0.99},
 {'class': 'setosa', 'probability': 0.98},
 {'class': 'setosa', 'probability': 0.96},
 {'class': 'setosa', 'probability': 0.98},
 {'class': 'setosa', 'probability': 0.95},
 {'class': 'setosa', 'probability': 0.98},
 {'class': 'setosa', 'probability': 1.0},
 {'class': '

In [174]:
X = np.array([5, 2, 3, 4]).reshape(1, -1) #.reshape(1, 4)

# prediction = predict_model(X) # self.model.predict_proba([X])
predictions = model_.predict_proba(X)

dic = {'class': iris_type[np.argmax(predictions, axis=1)[0]],
        'probability': round(max(predictions[0]), 2)}

TypeError: unhashable type: 'numpy.ndarray'

In [173]:
dic

{'class': 'virginica', 'probability': 0.91}