In [260]:
#imports
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [261]:
%config IPCompleter.greedy=True

In [262]:
# Dataset locations, given as paths relative to the working directory.
PATH_1 = "DatasetsImmo/immozip_oneHot.csv"  # one-hot encoded file
PATH_2 = "DatasetsImmo/RA_immoZip.csv"  # file loaded as `immo_classic`

In [263]:
# Read the CSV at PATH_2 and drop its "Unnamed: 0" column in one chained step,
# then report the resulting (rows, columns) shape.
immo_classic = pd.read_csv(PATH_2).drop(columns="Unnamed: 0")
print("immo_classic ={}".format(immo_classic.shape))

immo_classic =(11865, 22)


In [264]:
# Read the CSV at PATH_1 and drop its "Unnamed: 0" column in one chained step,
# then report the resulting (rows, columns) shape.
immo_one_hot = pd.read_csv(PATH_1).drop(columns="Unnamed: 0")
print("immo_one_hot.shape ={}".format(immo_one_hot.shape))

immo_one_hot.shape =(11865, 62)


In [265]:
immo_one_hot.head(3)

Unnamed: 0,price,constructionYear,room_number,mètres carrés,prix mètres carrés,parking extérieur,atticExists,basementExists,hasSwimmingPool,zip,...,condition_is_to be done up,condition_is_to renovate,condition_is_to restore,heatingType_is_carbon,heatingType_is_electric,heatingType_is_fueloil,heatingType_is_gas,heatingType_is_pellet,heatingType_is_solar,heatingType_is_wood
0,1550000,2017.0,3.0,213,7277.0,1.0,False,False,False,1050,...,0,0,0,0,0,0,1,0,0,0
1,495000,1960.0,3.0,120,4125.0,0.0,False,True,False,1050,...,0,0,0,0,0,0,1,0,0,0
2,1537000,2015.0,2.0,160,9606.25,1.0,False,True,False,1050,...,0,0,0,0,0,0,1,0,0,0


# Drop Outlier

[Snippet](https://medium.com/datadriveninvestor/finding-outliers-in-dataset-using-python-efc3fce6ce32)

In [266]:
def detect_outlier(data_1, threshold=3):
    """Return the values of ``data_1`` whose absolute z-score exceeds ``threshold``.

    The z-score of a value ``y`` is ``(y - mean) / std``, where ``mean`` and
    ``std`` come from ``np.mean`` and ``np.std`` applied to ``data_1``.

    Parameters
    ----------
    data_1 : sequence of numbers
        Any sized iterable of numeric values (list, NumPy array, pandas
        Series, ...). It must support ``len()`` and iteration.
    threshold : float, default 3
        Minimum absolute z-score (exclusive) for a value to be reported.

    Returns
    -------
    list
        The flagged values in input order, duplicates included. A new list is
        built on every call, so repeated calls never accumulate results.
    """
    # Nothing to measure: avoid taking the mean/std of an empty input.
    if len(data_1) == 0:
        return []

    mean_1 = np.mean(data_1)
    std_1 = np.std(data_1)

    # With zero spread no value can be an outlier, and dividing by the
    # standard deviation would be a division by zero. A NaN spread also
    # fails this comparison and likewise yields no outliers.
    if not std_1 > 0:
        return []

    # Results are collected locally rather than in a module-level list, so
    # one call cannot leak its findings into the next.
    return [y for y in data_1 if np.abs((y - mean_1) / std_1) > threshold]

In [267]:
immo_classic.price.max()

9500000

In [268]:
immo_classic.price.min()

15000

In [270]:
price_outliers = detect_outlier(immo_classic.price)

In [272]:
#price_outliers

In [277]:
# Keep the rows whose price appears in the list returned by the z-score detector.
is_price_outlier = immo_classic["price"].isin(price_outliers)
outliers = immo_classic[is_price_outlier]

 > We can now take the minimum and maximum outlier prices and use them to filter those values out of our data.

In [292]:
# Smallest and largest price among the flagged rows.
outlier_prices = outliers["price"]
min_outliers = outlier_prices.min()
max_outliers = outlier_prices.max()

In [296]:
# Remove only the high-side price outliers.
# Filtering with `price < min_outliers` is correct only when every flagged
# value lies above the mean: one low-side outlier would discard almost the
# whole dataset, and an empty outlier set gives a NaN cutoff that discards
# everything. An explicit mask of "flagged AND above the mean" gives the same
# result in the all-high case and stays correct in the other two.
price_column = immo_classic["price"]
is_big_outlier = price_column.isin(price_outliers) & (price_column > price_column.mean())
immo_classic_no_big_outliers = immo_classic[~is_big_outlier]

In [297]:
immo_classic_no_big_outliers.price.max()

2095000

# Using IQR

IQR tells how spread the middle values are. It can be used to tell when a value is too far from the middle.
An outlier is a point which falls more than 1.5 times the interquartile range above the third quartile or below the first quartile.

##  step 1:
- Arrange the data in increasing order
- Calculate first(q1) and third quartile(q3)
- Find interquartile range (q3-q1)
- Find lower bound: q1 - 1.5 * IQR
- Find upper bound: q3 + 1.5 * IQR
- Anything that lies outside of lower and upper bound is an outlier

First sorting the dataset

In [302]:
# Order the rows by ascending price; this modifies `immo_classic` itself.
immo_classic.sort_values("price", inplace=True)

Finding first quartile and third quartile

In [304]:
# 25th and 75th percentiles of the price column (first and third quartiles).
price_quartiles = np.percentile(immo_classic["price"], [25, 75])
q1, q3 = price_quartiles

Find the IQR which is the difference between third and first quartile

In [306]:
# Interquartile range: distance between the third and first quartiles.
iqr = q3 - q1
# Bare expression so the notebook displays the value.
iqr

265000.0

Find lower and upper bound

In [307]:
# Outlier fences: 1.5 interquartile ranges below q1 and above q3.
fence_width = 1.5 * iqr
lower_bound = q1 - fence_width
upper_bound = q3 + fence_width

In [308]:
lower_bound

-162500.0

In [309]:
upper_bound 

897500.0

## How to Calculate the Confidence Interval?
The interval is calculated using the following steps:

1. Gather the sample data.
2. Calculate the sample mean x̅.
3. Determine whether a population’s standard deviation is known or unknown.
4. If a population’s standard deviation is known, we can use a z-score for the corresponding confidence level.
5. If a population’s standard deviation is unknown, we can use a t-statistic for the corresponding confidence level.
6. Find the lower and upper bounds of the confidence interval using the following formulas:

- a. Known population standard deviation
![alt text](https://cdn.corporatefinanceinstitute.com/assets/confidence-interval1.png)

 - b. Unknown population standard deviation

$$\bar{x} \pm t_{\alpha/2,\,n-1}\,\frac{s}{\sqrt{n}}$$

# Dates stuff

In [None]:
immo_classic.dtypes

In [None]:
# Display-only preview of the column cast to strings: the result is not
# assigned back, so the column stored in `immo_classic` is left unchanged.
immo_classic.constructionYear.astype("str")

In [None]:
# Convert the construction year into timestamps using the "%Y" format.
# NOTE(review): assumes the column holds numeric years, possibly as floats
# (the one-hot preview shows values like 2017.0) — confirm for this file.
# A float year stringifies to "2017.0", which does not match "%Y", so each
# year is first rendered as bare integer digits. Missing years become None
# and therefore NaT after parsing.
year_text = immo_classic["constructionYear"].map(
    lambda year: str(int(year)) if pd.notna(year) else None
)
immo_classic["constructionYear"] = pd.to_datetime(year_text, format="%Y")

In [None]:
immo_classic.constructionYear

In [None]:
immo_classic.dtypes

In [None]:
# Use the construction date as the index so the frame can be resampled by time.
# NOTE(review): not re-runnable as-is — once the column has become the index,
# a second run presumably fails because "constructionYear" is no longer a column.
immo_classic= immo_classic.set_index("constructionYear")

In [None]:
immo_classic.index.has_duplicates

In [None]:
immo_classic.sort_index(ascending = True).head()

In [None]:
# Yearly averages, grouped by the datetime index.
# Only numeric and boolean columns are averaged: on pandas >= 2.0 `.mean()`
# raises when the frame still contains text columns, so they are filtered out
# first. `select_dtypes` keeps the index, so resampling works as before.
# NOTE(review): newer pandas releases deprecate the "A" alias in favour of
# "YE"; it is kept here for compatibility with older versions.
numeric_columns = immo_classic.select_dtypes(include=["number", "bool"])
immo_classic_year = numeric_columns.resample("A").mean()

In [None]:
immo_classic_year.head()

In [None]:
immo_classic_year.tail()

In [None]:
immo_classic_year.columns

In [None]:
# Create the figure at its final size (13.5 x 9 inches) up front, then draw
# the yearly price series onto its axes. `appl_open` still refers to the axes
# and `fig` to the figure that owns them.
fig, price_axes = plt.subplots(figsize=(13.5, 9))
appl_open = immo_classic_year["price"].plot(ax=price_axes, title="price")