In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
import seaborn as sns
import math
import statistics
import plotly.express as exp

In [63]:
data = pd.read_csv('adamic3.csv')
data

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,Adamic_Adar_index
0,0,-114.31,34.19,15,5612,1283,1015,472,1.4936,,0.0
1,1,-114.47,34.4,19,7650,1901,1129,463,1.82,80100.0,1.174879
2,2,-114.56,33.69,17,720,174,333,117,1.6509,85700.0,1.236472
3,3,-114.57,33.64,14,1501,337,515,226,3.1917,23000.0,1.885382
4,4,-114.57,33.57,20,1454,326,624,262,1.925,65500.0,1.220495
5,5,-114.58,33.63,29,1387,236,671,239,3.3438,74000.0,1.165225
6,6,-114.58,33.61,25,2907,680,1841,633,2.6768,82400.0,1.215647
7,7,-114.59,34.83,41,812,168,375,158,1.7083,48500.0,1.476733
8,8,-114.59,33.61,34,4789,1175,3134,1056,2.1782,2000.0,2.722153
9,9,-114.6,34.83,46,1497,309,787,271,2.1908,48100.0,1.476733


In [64]:
data=data.drop('Unnamed: 0',axis=1)
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,Adamic_Adar_index
0,-114.31,34.19,15,5612,1283,1015,472,1.4936,,0.0
1,-114.47,34.4,19,7650,1901,1129,463,1.82,80100.0,1.174879
2,-114.56,33.69,17,720,174,333,117,1.6509,85700.0,1.236472
3,-114.57,33.64,14,1501,337,515,226,3.1917,23000.0,1.885382
4,-114.57,33.57,20,1454,326,624,262,1.925,65500.0,1.220495
5,-114.58,33.63,29,1387,236,671,239,3.3438,74000.0,1.165225
6,-114.58,33.61,25,2907,680,1841,633,2.6768,82400.0,1.215647
7,-114.59,34.83,41,812,168,375,158,1.7083,48500.0,1.476733
8,-114.59,33.61,34,4789,1175,3134,1056,2.1782,2000.0,2.722153
9,-114.6,34.83,46,1497,309,787,271,2.1908,48100.0,1.476733


In [65]:
data=data.fillna(0)

In [66]:
data1=data[['median_income','median_house_value']].copy()
data1

Unnamed: 0,median_income,median_house_value
0,1.4936,0.0
1,1.82,80100.0
2,1.6509,85700.0
3,3.1917,23000.0
4,1.925,65500.0
5,3.3438,74000.0
6,2.6768,82400.0
7,1.7083,48500.0
8,2.1782,2000.0
9,2.1908,48100.0


In [67]:
km = KMeans(n_clusters = 4)
clusters=km.fit(data1)

In [68]:
clusters

KMeans(n_clusters=4)

In [69]:
#plt.scatter(*zip(*data1),c=clusters,marker = "x")

In [70]:
exp.scatter(data_frame= data1,x = 'median_house_value',y = 'median_income',color=km.labels_)

In [71]:
# # obtaining the centers of the clusters
# centroids = km.cluster_centers_
# # points array will be used to reach the index easy
# points = np.empty((0,len(data1['median_income'])), float)
# # distances will be used to calculate outliers
# distances = np.empty((0,len(data1['median_income'])), float)

In [72]:
def distance_from_center(income, house_value, label):
    '''
    Calculate the Euclidean distance between a data point and the center of its cluster.
:param float income: the standardized income of the data point 
    :param float age: the standardized age of the data point 
    :param int label: the label of the cluster
    :rtype: float
    :return: The resulting Euclidean distance  
    '''
    center_income =  clusters.cluster_centers_[label,0]
    center_house_value =  clusters.cluster_centers_[label,1]
    distance = np.sqrt((income - center_income) ** 2 + (house_value - center_house_value) ** 2)
    return np.round(distance, 3)

In [73]:
data1['label'] = clusters.labels_
data1['distance'] = distance_from_center(data1.median_income, data1.median_house_value, data1.label)

In [74]:
data1

Unnamed: 0,median_income,median_house_value,label,distance
0,1.4936,0.0,0,10000.0
1,1.82,80100.0,1,3833.333
2,1.6509,85700.0,1,1766.667
3,3.1917,23000.0,0,13000.0
4,1.925,65500.0,3,3757.143
5,3.3438,74000.0,1,9933.333
6,2.6768,82400.0,1,1533.333
7,1.7083,48500.0,3,13242.857
8,2.1782,2000.0,0,8000.0
9,2.1908,48100.0,3,13642.857


In [75]:
percentile = 70
# getting outliers whose distances are greater than some percentile
outliers_idx = data1.index[np.where(data1.distance > np.percentile(data1.distance, percentile))]

In [76]:
outliers_idx

Int64Index([3, 7, 9, 16, 17], dtype='int64')

In [77]:
outliers = data[data.index.isin(outliers_idx)]
# outliers

In [78]:
outliers1 = data[data.median_house_value==0]
outliers = pd.concat([outliers,outliers1]).drop_duplicates()
outliers

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,Adamic_Adar_index
3,-114.57,33.64,14,1501,337,515,226,3.1917,23000.0,1.885382
7,-114.59,34.83,41,812,168,375,158,1.7083,48500.0,1.476733
9,-114.6,34.83,46,1497,309,787,271,2.1908,48100.0,1.476733
16,-114.65,33.6,28,1678,322,666,256,2.9653,94900.0,1.339463
17,-114.65,32.79,21,44,33,64,27,0.8571,25000.0,1.870779
0,-114.31,34.19,15,5612,1283,1015,472,1.4936,0.0,0.0
18,-114.66,32.74,17,1388,386,775,320,1.2049,0.0,0.0


In [79]:
train=data
bad_df=data.index.isin(outliers.index)
train=train[~bad_df]

In [80]:
train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,Adamic_Adar_index
1,-114.47,34.4,19,7650,1901,1129,463,1.82,80100.0,1.174879
2,-114.56,33.69,17,720,174,333,117,1.6509,85700.0,1.236472
4,-114.57,33.57,20,1454,326,624,262,1.925,65500.0,1.220495
5,-114.58,33.63,29,1387,236,671,239,3.3438,74000.0,1.165225
6,-114.58,33.61,25,2907,680,1841,633,2.6768,82400.0,1.215647
8,-114.59,33.61,34,4789,1175,3134,1056,2.1782,2000.0,2.722153
10,-114.6,33.62,16,3741,801,2434,824,2.6797,86500.0,1.262251
11,-114.6,33.6,21,1988,483,1182,437,1.625,62000.0,1.194717
12,-114.61,34.84,48,1291,248,580,211,2.1571,68600.0,1.194717
13,-114.61,34.83,31,2478,464,1346,479,3.212,70400.0,1.156629


In [81]:
train.to_csv('train.csv')

In [82]:
outliers.to_csv('regenerate.csv')