In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler 
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error , mean_absolute_percentage_error, mean_squared_error
%matplotlib inline
# Load the housing table once; the plotting helpers below read this
# module-level ``df`` directly.
df = pd.read_csv(filepath_or_buffer="housing.csv")
# Independent working copy that the distribution step (menu option 7)
# filters, so ``df`` itself is left as loaded.
data = df.copy(deep=True)

def d(df):
    """Print the frame, its ``info()`` report and its ``describe()`` summary.

    Args:
        df: The pandas DataFrame to display.

    Returns:
        None. Everything is written to standard output.
    """
    print("",'\033[1m'+'ACCESING THE DATA'+'\033[0m')
    print("\n",'\033[1m'+'CSV Data'+'\033[0m')
    print(df)
    print("\n",'\033[1m'+'INFO'+'\033[0m')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    df.info()
    print("\n",'\033[1m'+'DESCRIBE'+'\033[0m')
    print(df.describe())
    print("\n")

def scatterplot():
    """Scatter the module-level ``df`` by longitude/latitude, coloured by house value.

    Reads the ``longitude``, ``latitude`` and ``median_house_value`` columns
    of the global ``df`` and shows the figure with a colour bar.
    """
    print("", "\033[1mSCATTER PLOT\033[0m")
    plt.figure(figsize=(8, 8))
    x_name, y_name = "longitude", "latitude"
    plt.scatter(df[x_name], df[y_name], c=df["median_house_value"])
    plt.colorbar()
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title("House Prices")
    plt.show()
    print("\n")

def heatmap():
    """Show an annotated correlation heat map of the numeric columns of ``df``.

    Reads the module-level ``df``. Only numeric columns are correlated:
    calling ``DataFrame.corr()`` on a frame that still holds text columns
    raises in pandas >= 2.0 instead of silently dropping them.
    """
    print("",'\033[1m'+'HEAT MAP'+'\033[0m')
    plt.figure(figsize = (12, 8))
    # NOTE(review): assumes the CSV may contain non-numeric columns (the
    # box-plot helper also filters by dtype) -- confirm against the data.
    numeric_only_frame = df.select_dtypes(include = "number")
    sns.heatmap(numeric_only_frame.corr(), annot = True, fmt = '.2f', cmap = 'viridis')
    plt.show()
    print("\n")
    
def histogram():
    """Plot a 50-bin histogram for each column ``DataFrame.hist`` draws from ``df``."""
    print("", "\033[1mHISTOGRAM\033[0m")
    # Large black-filled figure so every per-column panel stays legible.
    hist_options = {"bins": 50, "figsize": (30, 20), "color": "k"}
    df.hist(**hist_options)
    plt.show()
    print("\n")
    
def boxplot():
    """Draw one box plot per numeric column of the module-level ``df``.

    The first two int64/float64 columns are skipped (presumably longitude and
    latitude -- confirm against the CSV column order). Panels are laid out two
    per row; the row count follows the number of columns, and any leftover
    panel is hidden.
    """
    print("",'\033[1m'+'BOX PLOT'+'\033[0m')
    num_columns = list(df.select_dtypes(include=["int64","float64"]).columns)[2:]
    # Ceiling division: enough two-wide rows for every column, at least one
    # row so plt.subplots always gets a valid grid.
    n_rows = max(1, -(-len(num_columns) // 2))
    fig, ax = plt.subplots(n_rows, 2, figsize = (15,15))
    ax = np.ravel(ax)
    # Pair each column with its own axis directly instead of looking the
    # position up with list.index(), which is linear per iteration and
    # returns the first match for duplicated column names.
    for axis, column in zip(ax, num_columns):
        sns.boxplot(data = df, x = column, ax = axis, palette = "plasma").set_title(column)
    # Hide panels that received no column (e.g. the 8th slot for 7 columns).
    for axis in ax[len(num_columns):]:
        axis.set_visible(False)
    plt.tight_layout()
    plt.show()
    print("\n")
    
def m_data():
    """Report duplicates and missing values in ``df`` and fill ``total_bedrooms``.

    Reads the module-level ``df``. Steps:

    1. Print the shape before and after dropping duplicate rows.
    2. Print per-column missing counts, the worst column's missing percentage
       and the overall missing percentage, then show a heat map of the nulls.
    3. Compute the bedrooms-to-rooms ratio on the de-duplicated frame, drop
       ratio outliers by z-score, and use the mean of the remaining ratios
       times ``total_rooms`` to fill missing ``total_bedrooms`` values.

    The filled frame is a local copy: it is printed (null counts only) but
    neither returned nor written back to ``df``.
    """
    def Zscore_outlier(column,df,scale=3.75):
        """Return ``df`` without rows whose ``column`` z-score exceeds ``scale`` in magnitude.

        Rows where the value is missing have a NaN z-score and are kept.
        """
        values = df[column]
        m = np.mean(values)
        sd = np.std(values)
        # Vectorised |z| test; NaN comparisons are False, so missing rows stay.
        is_outlier = ((values - m) / sd).abs() > scale
        return df[~is_outlier]

    print("",'\033[1m'+'MISSING DATA'+'\033[0m')
    print("Data shape Before duplicates Values: ",df.shape)
    # Explicit copy so the column assignment further down writes to an
    # independent frame rather than to a possible view of ``df``.
    df_new = df.drop_duplicates().copy()
    print("Data shape After duplicates Values: ",df_new.shape)
    missing_values_df = df.isnull().sum()
    print(missing_values_df)

    # Percentage of missing data by feature, keeping only affected columns,
    # highest percentage first.
    df_na = (missing_values_df / len(df)) * 100
    df_na = df_na[df_na != 0].sort_values(ascending=False)
    if df_na.empty:
        # Nothing to report; indexing position 0 would raise IndexError.
        print("No column has missing values")
    else:
        # .iloc makes the positional access explicit; plain [0] on a
        # label-indexed Series is deprecated.
        print("Percentage of missing values in {} : {} %".format(df_na.index[0],df_na.iloc[0]))
    # DataFrame.size is rows * columns (np.product was removed in NumPy 2.0).
    total_cells = df.size
    total_missing = missing_values_df.sum()
    print("percent of data that is missing from all Dataset: {}%".format(round((total_missing/total_cells) * 100,ndigits=2)))

    # Heat map of the null mask
    f, ax = plt.subplots(figsize=(9, 6))
    sns.heatmap(df.isnull())
    ax.set(title = 'heatmap of missing data by feature', xlabel = 'feature')
    plt.show()

    # Bedrooms-to-rooms ratio before and after removing outliers
    fig, ax = plt.subplots(1,2, figsize = (12,4))
    ax = np.ravel(ax)
    bedrooms_total = (df_new['total_bedrooms'] / df_new['total_rooms']).to_frame('bedrooms_total')
    print("bedrooms to total ratio mean : ",bedrooms_total['bedrooms_total'].mean())
    sns.boxplot(data = bedrooms_total, x = 'bedrooms_total', palette = 'viridis', ax = ax[0]).set_title('bedrooms to total ratio before Outlier')
    bedrooms_total = Zscore_outlier(column = 'bedrooms_total', df = bedrooms_total)
    mean_ratio = bedrooms_total['bedrooms_total'].mean()
    print("bedrooms to total ratio mean : ",mean_ratio)
    sns.boxplot(data = bedrooms_total, x = 'bedrooms_total', palette = 'cividis', ax = ax[1]).set_title('bedrooms to total ratio after drop Outlier')
    plt.tight_layout()
    plt.show()

    # Fill missing bedroom counts with (typical ratio) * (rooms in that row).
    df_new['total_bedrooms'] = df_new['total_bedrooms'].fillna(mean_ratio * df_new['total_rooms'])
    print(df_new.isnull().sum())
    print("\n")
    
def dt(df):
    """Show KDE and box plots of ``total_bedrooms`` in a 2x2 grid.

    Args:
        df: Frame containing a ``total_bedrooms`` column.

    Note:
        The "After" and "Before" panels are all drawn from the same ``df``;
        only the palette argument and title differ between them.
    """
    figure, grid = plt.subplots(2, 2, figsize=(12, 8))
    panels = np.ravel(grid)
    # (plot function, palette, title) for each panel, in row-major order.
    layout = (
        (sns.kdeplot, 'viridis', 'After Distribution'),
        (sns.kdeplot, 'cividis', 'Before Distribution'),
        (sns.boxplot, 'viridis', 'After Distribution'),
        (sns.boxplot, 'cividis', 'Before Distribution'),
    )
    for panel, (draw, colours, heading) in zip(panels, layout):
        draw(data=df, x='total_bedrooms', ax=panel, palette=colours).set_title(heading)
    plt.tight_layout()
    plt.show()
    print("\n")
    
def Distribution2(columne,data,i):
    """Plot KDE, box plot and price scatter for one column side by side.

    Args:
        columne: Name of the column in ``data`` to plot on the x axis.
        data: Frame holding ``columne`` and ``median_house_value``.
        i: Index into the title list; 0 titles the panels
            'Before Distribution', 1 titles them 'After Distribution'.
            ``i == 1`` also switches to the red/magma styling.
    """
    figure, panels = plt.subplots(1, 3, figsize=(15, 5))
    title = ['Before Distribution', 'After Distribution']
    panels = np.ravel(panels)

    # Per-plot styling: red/magma when i == 1, a single blue otherwise.
    if i == 1:
        kde_style = {'color': 'r'}
        box_style = {'palette': 'magma'}
        scatter_style = {'color': 'r'}
    else:
        blue = '#2171b5'
        kde_style = {'color': blue}
        box_style = {'color': blue}
        scatter_style = {'color': blue}

    sns.set(style='whitegrid')
    sns.kdeplot(data=data, x=columne, ax=panels[0], **kde_style).set_title(title[i])
    sns.boxplot(data=data, x=columne, ax=panels[1], **box_style).set_title(title[i])
    sns.scatterplot(
        data=data,
        x=columne,
        ax=panels[2],
        y=data['median_house_value'],
        **scatter_style
    ).set_title(title[i])

    plt.tight_layout()
    plt.show()
    print("\n")
    
# Interactive menu. The loop repeats until option 8 is chosen; as in the
# loop condition, any number of 10 or more also ends it.
choice = 0

while choice < 10:
    print("",'\033[1m'+'DATA SCIENCE MINI PROJECT on HOUSE PRICES'+'\033[0m')
    print('\033[1m'+'1'+'\033[0m', "Accessing the data")
    print('\033[1m'+'2'+'\033[0m', "Scatter plot accross the Map to see the house price")
    print('\033[1m'+'3'+'\033[0m', "Heatmap according to prices")
    print('\033[1m'+'4'+'\033[0m', "Data representation in Histogram")
    print('\033[1m'+'5'+'\033[0m', "Data representation in Boxplot")
    print('\033[1m'+'6'+'\033[0m', "Handling of missing data")
    print('\033[1m'+'7'+'\033[0m', "Distribution data represenation")
    print('\033[1m'+'8'+'\033[0m', "EXIT")

    try:
        choice = int(input("Enter your choice: "))
    except ValueError:
        # Non-numeric input: show the menu again instead of crashing.
        print("Invalid input: please enter a number from 1 to 8.\n")
        continue

    if choice == 1:
        d(df)

    elif choice == 2:
        scatterplot()

    elif choice == 3:
        heatmap()

    elif choice == 4:
        histogram()

    elif choice == 5:
        boxplot()

    elif choice == 6:
        m_data()

    elif choice == 7:
        print("\n",'\033[1m'+'DISTRIBUTION OF DATA'+'\033[0m')
        dt(df)
        # For each column: plot it, then (when a cutoff is given) report how
        # many rows sit at or above the cutoff, drop them from ``data`` and
        # plot again. A cutoff of None means "plot only, no filtering".
        # Filters accumulate, so later columns see the already-trimmed frame.
        for column, cutoff in (
            ('total_bedrooms', 3000),
            ('total_rooms', 15000),
            ('housing_median_age', None),
            ('population', 6500),
            ('households', 2000),
            ('median_income', 9),
        ):
            Distribution2(columne = column, data = data, i = 0)
            if cutoff is None:
                continue
            # Printed explicitly: a bare expression shows nothing inside a loop.
            print(data[data[column] >= cutoff].shape)
            data = data[data[column] < cutoff]
            Distribution2(columne = column, data = data, i = 1)
        print(data.shape)

    elif choice == 8:
        print("",'\033[1m'+'THANK YOU'+'\033[0m')
        break

    elif choice < 10:
        # Unknown option that does not end the loop: tell the user.
        print("Invalid choice: please enter a number from 1 to 8.\n")

 [1mDATA SCIENCE MINI PROJECT on HOUSE PRICES[0m
[1m1[0m Accessing the data
[1m2[0m Scatter plot accross the Map to see the house price
[1m3[0m Heatmap according to prices
[1m4[0m Data representation in Histogram
[1m5[0m Data representation in Boxplot
[1m6[0m Handling of missing data
[1m7[0m Distribution data represenation
[1m8[0m EXIT
Enter your choice: 1
 [1mACCESING THE DATA[0m

 [1mCSV Data[0m
       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.25     37.85                52.0       1274.0           235.0   
4        -122.25     37.85                52.0       1627.0           280.0   
...          ...       ...                 ...          ...             ...   
20635    -121.09     39.