# Take home assessment for Data Analyst Intern position at Vigilant

Name: Elaine Gao

In [95]:
class k_nearest:
    import pandas as pd
    import numpy as np
    import warnings
    import random

    def __init__(self, k, p, data, normalization=False):
        self.k = k
        self.norm = normalization
        self.p = p
        self.data = data
        
        
    def distance(self, p, raw_x, raw_y):
        raw = raw_x - raw_y
        if p ==  1:
            raw_ = np.absolute(raw)
        else:
            raw_ = np.power(np.absolute(raw),p)
            
        return np.sum(raw_) ** (1/p)
    
    
    def scale_average(self, data):
        mean = data.mean()
        mx = data.max()
        mi = data.min()
        
        return (data-mean)/(mx-mi)
    
    
    def scale_minmax(self, data):
        mx = data.max()
        mi = data.min()
        
        return (data-mi)/(mx-mi)
    
    
    def scale_standard(self, data):
        mean = data.mean()
        sde = np.sqrt(data.var())
        
        return (data-mean)/sde
    
    
    def get_k_nearest(self, input_data):
        
        self.data_dis = self.data.drop(self.data.columns[-1], axis=1)
        
        if not self.norm:
            self.data1 = self.data_dis.copy()
        elif self.norm == "average":
            self.data1 = self.scale_average(self.data_dis)
        elif self.norm == "minmax":
            self.data1 = self.scale_minmax(self.data_dis)
        elif self.norm == "standard":
            self.data1 = self.scale_standard(self.data_dis)
        else:
            raise Warning("Nomalization could only be {'average','minmax','standard',False}")
        
        self.result = self.data.copy()
        
        if self.p < 1:
            raise Warning("P must be positive!")
        
        if int(self.p) != self.p:
            raise Warning("P must be positive interger!")
            
        self.result["distance"] = self.data1.apply(lambda x: self.distance(p=self.p, raw_x = input_data, raw_y=x), axis=1)
        
        
        self.result = self.result.sort_values(by="distance")
        
        #if self.p == 1:
        #    print("The distance is calculated by Manhattan.")
        #elif self.p == 2:
        #    print("The distance is calculated by Euclidean.")
        #elif self.p == int(self.p):
        #    print("The distance is calculated by Minkowski and p = " + str(self.p) + ".")
        #
        #print("The k most analogous data instances is generated!")
        
        #return self.result.iloc[:self.k,:]
    
    def get_distance(self):
        return self.result.iloc[:self.k,:]
        
    def get_count_(self):
        self.output = self.result.iloc[:self.k,:]
        self.output = pd.DataFrame(self.output.iloc[:,-2].value_counts())
        self.output.columns = ["Count"]
        return self.output.sort_values(by="Count", ascending = False)
    
    def predict(self):
        self.output = self.result.iloc[:self.k,:]
        self.output = pd.DataFrame(self.output.iloc[:,-2].value_counts())
        self.output.columns = ["Count"]
        output = self.output.sort_values(by="Count", ascending = False)
        output = output.reset_index()
        mx = output.iloc[0,1]
        choice = output[output.Count == mx].iloc[:,0].tolist()
        return list(random.choice(choice))
        

## Model fitting

k_nearest classifier

### Parameters

K: interger 
> The number of neighbors that are used to generate the predition.

p: interger
> The parameter of chebyshev method for distance calculation.
            
data: DataFrame
> DataFrame_like, shape(n_samples,n_variables). Target contained

normalization: string, optional, (default=False)
> The method used to normalized the dataset. The options are {'average','minmax','standard',False}.


### Attributes

result: DataFrame_like
> The dataset with distance calculated.

### Methods:
#### get_k_nearest(input_data)
Parameter:  input_data (array_like)  
return: an instance of self.

#### get_distance()
retrun: top k analogous instances with distance calculated.

#### predict()
return: The prediction with the most rates.

In [111]:
# Generate a dataframe
data = pd.DataFrame()
data["column1"] = [5.1, 4.9, 4.7, 6.3, 6.1, 7.7, 6.8]
data["column2"] = [3.5, 3.0, 3.2, 2.8, 2.6, 3.0, 4.8]
data["column3"] = [0.2, 0.2, 0.2, 1.5, 1.4, 2.3, 1.4]
data["column4"] = ["A", "A", "A", "C", "C", "C", "B"]

# Create a target array
a = np.array([5.1, 3.5, 0.3])

# Fit the target with dataFrame
model = k_nearest(k = 6, p=2, data=data, normalization=None)
model.get_k_nearest(input_data = a )

# Get the dataframe of with the distance
model.get_distance()

# Get the count of each category
model.get_count_()

# Get the prediction
model.predict()

# Get the distance calculated
model.result

Unnamed: 0,column1,column2,column3,column4,distance
0,5.1,3.5,0.2,A,0.1
2,4.7,3.2,0.2,A,0.509902
1,4.9,3.0,0.2,A,0.547723
4,6.1,2.6,1.4,C,1.737815
3,6.3,2.8,1.5,C,1.835756
6,6.8,4.8,1.4,B,2.406242
5,7.7,3.0,2.3,C,3.318132


## Example

In [136]:
test = pd.DataFrame()
test["column1"] = [5.4, 4.9, 5.7, 6.3, 4.1, 4.7]
test["column2"] = [3.5, 3.5, 1.2, 4.8, 2.6, 3.0]
test["column3"] = [1.2, 0.2, 2.2, 2.5, 4.4, 2.3]


model = k_nearest(k = 3, p=2, data=data, normalization=None)
def generate_prediction(model,x):
    model.get_k_nearest(input_data = x)
    return model.predict()

test["prediction"] = test.apply(lambda x: generate_prediction(model, x[0:4]), axis=1)
test

Unnamed: 0,column1,column2,column3,prediction
0,5.4,3.5,1.2,[A]
1,4.9,3.5,0.2,[A]
2,5.7,1.2,2.2,[B]
3,6.3,4.8,2.5,[A]
4,4.1,2.6,4.4,[B]
5,4.7,3.0,2.3,[A]


## Task data

In [139]:
from sklearn.model_selection import train_test_split

data = pd.read_csv("/Users/gaozhiping/Desktop/analytical_cases/data.csv")
train, test = train_test_split(data, test_size=0.33)

In [148]:
model = k_nearest(k = 2, p=2, data=train, normalization=None)
def generate_prediction(model,x):
    model.get_k_nearest(input_data = x)
    return model.predict()

test["prediction"] = test.apply(lambda x: generate_prediction(model, x[0:4])[0], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [149]:
print("The precition of each group is: ")
test[test.prediction == test.Target].prediction.value_counts()/test.prediction.value_counts()

The precition of each group is: 


C    0.857143
B    1.000000
A    1.000000
Name: prediction, dtype: float64