In [1]:
from sklearn.datasets import load_iris, fetch_california_housing
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
df = pd.read_csv('../Data/df.csv')

In [3]:
print(df.head(2))

        t_dat                                        customer_id  article_id  \
0  2018-12-17  870e1f3e107f1972e19f19365b940e0d9c9806965a58ef...   562252028   
1  2018-11-22  9f9d4926e9d48d52fd2ee2e6736d5bdfa57f72196ed6d3...   662592002   

      price  sales_channel_id club_member_status fashion_news_frequency   age  \
0  0.025407                 1             ACTIVE                   NONE  22.0   
1  0.016932                 2             ACTIVE              Regularly  34.0   

                                         postal_code  product_code  ...  \
0  07e430e18527746a3de808448ee30746851f3cf90d2a06...        562252  ...   
1  fc5b9ef2dd60d4bca8f0c5892f9fb54580b28fe2d8e7df...        662592  ...   

  department_name  index_code  index_name index_group_no  index_group_name  \
0         Trouser           A  Ladieswear              1        Ladieswear   
1         Basic 1           D     Divided              2           Divided   

  section_no                section_name garment_group

In [4]:
df.columns

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'club_member_status', 'fashion_news_frequency', 'age', 'postal_code',
       'product_code', 'prod_name', 'product_type_no', 'product_type_name',
       'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')

In [7]:
df.drop(['club_member_status', 'fashion_news_frequency', 
       'product_group_name', 'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'index_group_no', 'index_group_name', 
       'garment_group_no', 'garment_group_name'], axis=1, inplace=True)

In [8]:
df.duplicated().sum()   

165

In [9]:
df.drop_duplicates(inplace=True)

In [12]:
df_clean = df.to_csv('../Data/df_clean.csv', index=False)

#### Checking for anomalies

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199835 entries, 0 to 199999
Data columns (total 22 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   t_dat                      199835 non-null  object 
 1   customer_id                199835 non-null  object 
 2   article_id                 199835 non-null  int64  
 3   price                      199835 non-null  float64
 4   sales_channel_id           199835 non-null  int64  
 5   age                        199835 non-null  float64
 6   postal_code                199835 non-null  object 
 7   product_code               199835 non-null  int64  
 8   prod_name                  199835 non-null  object 
 9   product_type_no            199835 non-null  int64  
 10  product_type_name          199835 non-null  object 
 11  graphical_appearance_no    199835 non-null  int64  
 12  graphical_appearance_name  199835 non-null  object 
 13  colour_group_code          199835 

#### Quick EDA

In [14]:
#your code here
features = df.select_dtypes(include='number')
features

Unnamed: 0,article_id,price,sales_channel_id,age,product_code,product_type_no,graphical_appearance_no,colour_group_code,department_no,section_no
0,562252028,0.025407,1,22.0,562252,272,1010016,9,1722,15
1,662592002,0.016932,2,34.0,662592,254,1010016,9,1643,51
2,607031009,0.011847,2,21.0,607031,59,1010016,17,4242,60
3,800590003,0.076254,2,24.0,800590,245,1010016,31,1648,18
4,843685003,0.016932,1,33.0,843685,255,1010010,7,1676,16
...,...,...,...,...,...,...,...,...,...,...
199995,589017001,0.038593,2,20.0,589017,306,1010016,12,3710,61
199996,695071003,0.033881,2,37.0,695071,297,1010001,41,3709,62
199997,559637003,0.010153,2,31.0,559637,59,1010017,9,4242,60
199998,688558001,0.016542,2,32.0,688558,254,1010016,9,1643,51


In [15]:
for item in features.columns:
    features[item] = pd.to_numeric(features[item], errors='coerce')


#### Train Test Split

In [16]:
df.columns

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'age', 'postal_code', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'section_no', 'section_name', 'detail_desc'],
      dtype='object')

In [17]:
target = df["customer_id"]

In [18]:
features

Unnamed: 0,article_id,price,sales_channel_id,age,product_code,product_type_no,graphical_appearance_no,colour_group_code,department_no,section_no
0,562252028,0.025407,1,22.0,562252,272,1010016,9,1722,15
1,662592002,0.016932,2,34.0,662592,254,1010016,9,1643,51
2,607031009,0.011847,2,21.0,607031,59,1010016,17,4242,60
3,800590003,0.076254,2,24.0,800590,245,1010016,31,1648,18
4,843685003,0.016932,1,33.0,843685,255,1010010,7,1676,16
...,...,...,...,...,...,...,...,...,...,...
199995,589017001,0.038593,2,20.0,589017,306,1010016,12,3710,61
199996,695071003,0.033881,2,37.0,695071,297,1010001,41,3709,62
199997,559637003,0.010153,2,31.0,559637,59,1010017,9,4242,60
199998,688558001,0.016542,2,32.0,688558,254,1010016,9,1643,51


In [19]:
target

0         870e1f3e107f1972e19f19365b940e0d9c9806965a58ef...
1         9f9d4926e9d48d52fd2ee2e6736d5bdfa57f72196ed6d3...
2         57d76a4cd7a0f33d0a4646ef3bc9b44812d4a93bea73db...
3         090a071bc5788e8ecb31e4c6851f4b1022557e216e377d...
4         985792c784c61c29985b52f18ca559ed19db4bab94c7ca...
                                ...                        
199995    cc7e545e2416ba987a54723e88e452280675eeae33412b...
199996    5892fbb896678b351b8a9b5c33e9aac69df2407bb8dc10...
199997    8e4114966b50e2196367bd0deb9efc8b57fafb1ff697fb...
199998    0d7e8d324e428e93854bda515c629542ba4f45a0ffc0f1...
199999    b01b6c359482d00c7e0ed9af1d54af3d061ef8c23fd233...
Name: customer_id, Length: 199835, dtype: object

Now we perform the division between Train and Test, we will reserve 20% of our data to Test.

In [20]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

In [21]:
from sklearn.neighbors import KNeighborsRegressor

In [22]:
knn = KNeighborsRegressor(n_neighbors=10)

In [23]:
knn.fit(X_train, y_train)

#####  We are going to evaluate our model performance with R-Squared

In [24]:
knn.score(X_test, y_test)

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [None]:
y_predict= knn.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_standardized = mean_squared_error(y_test, y_predict)
mae_standardized = mean_absolute_error(y_test, y_predict)
r2_standardized = r2_score(y_test, y_predict)

mse_standardized, mae_standardized, r2_standardized