# Construirea unui model matematic de regresie liniara multivariata

`M`achine `L`earning, `A`rtificial `I`nteligence

## 1. Importuri necesare, setari

In [34]:
# required imports
import gzip
import json
import os
import re

# multi-array stuff
import pandas as pd
import numpy as np

# plot stuff
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sea

# import and set no warnigns
import warnings
warnings.filterwarnings('ignore')

# set pd max cols / rows = 999
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

## 2. Incarcare, parsare, modelare date AUTOVIT.RO

In [35]:
def strip_nr(str_nr):
    if str_nr is not None:
        return int(re.sub(r'[^0-9]+', '', str_nr))
    else:
        raise TypeError
    
def get_listing_param(data, name):
    for p in data['listing_params']:
        if p[0] == name:
            return p[1]
    else:
        return None

def get_data():
    raw_adverts = []
    for root, dirs, files in os.walk('../database/autovit/adverts/'):
        for name in files:
            file = os.path.join(root, name)
            with gzip.open(file, 'r') as fin:
                raw_adverts.extend(json.loads(fin.read().decode('utf-8')))
    print('** Total loaded adverts: {} **'.format(len(raw_adverts)))
    raw_adverts[0].pop('photos', None)
    return raw_adverts

def parse_data(raw_adverts):
    parsed_adverts = []
    for ad in raw_adverts:
        # Ad has req data, essential
        if 'list_label' in ad \
            and get_listing_param(ad, 'Anul fabricatiei') is not None \
            and get_listing_param(ad, 'Kilometraj') is not None \
            and get_listing_param(ad, 'Combustibil') is not None \
            and 'features' in ad \
            and 'params' in ad:

            # reshape params data
            if 'params' in ad:
                for p in ad['params']:
                    ad[p[0]] = p[1]
                # pop extra params
                ad.pop('params', None)

            # reshape features
            if 'features' in ad:
                for f in ad['features']:
                    ad[f] = True
                # pop extra features
                ad.pop('features', None)

            # format price as int only
            if 'list_label' in ad:
                ad['price'] = strip_nr(ad['list_label'])
                ad.pop('list_label', None)
            else:
                print('No list label')

            ad['fabrication_year'] = strip_nr(get_listing_param(ad, 'Anul fabricatiei'))
            ad['kilometers'] = strip_nr(get_listing_param(ad, 'Kilometraj'))
            ad['fuel'] = get_listing_param(ad, 'Combustibil').lower()
            ad['brand_model'] = '{}-{}'.format(ad['Marca'], ad['Model'])

            # Params
            hp = ad['Putere'] if 'Putere' in ad else None
            if hp is not None:
                hp = strip_nr(hp.replace('CP', ''))
                ad['horse_power'] = hp

            co2 = ad['Emisii CO2'] if 'Emisii CO2' in ad else None
            if co2 is not None:
                co2 = strip_nr(co2.replace('g/km', ''))
                ad['emissions'] = co2

            # Listing params
            cc = get_listing_param(ad, 'Capacitate cilindrica')
            if cc is not None:
                cc = strip_nr(cc.replace('cm3', ''))
                ad['cubic_capacity'] = cc

            # pop unused keys
            ad.pop('photos', None)
            ad.pop('description', None)

            # append to list
            parsed_adverts.append(ad)

    return parsed_adverts


adverts = parse_data(get_data())
print('** Parsed usable adverts: {} **'.format(len(adverts)))

** Total loaded adverts: 8306 **
** Parsed usable adverts: 7076 **


### Informatii despre dataset
`count`, `mean`, `min`, `max`, `std = standard deviation`

In [36]:
df = pd.DataFrame(adverts)
df.describe()

Unnamed: 0,age,highlighted,urgent,topAd,category_id,business,hide_user_ads_button,dealer_logo_in_results,dealer_logo_in_ad_page,has_phone,has_email,map_radius,map_private_radius_size,ad_featured,price,fabrication_year,kilometers,horse_power,cubic_capacity,emissions
count,7076.0,7076.0,7076.0,7076.0,7076.0,7076.0,7076.0,7076.0,7076.0,7041.0,7076.0,7076.0,7076.0,7076.0,7076.0,7076.0,7076.0,6971.0,6930.0,4300.0
mean,72.856416,0.0,0.0,0.029112,29.0,0.526286,0.0,0.405879,0.44446,1.0,1.0,0.044941,2.0,0.0,17453.285331,2013.341012,148365.3,162.811218,1979.163348,140.655581
std,161.639753,0.0,0.0,0.168134,0.0,0.499344,0.0,0.491096,0.496941,0.0,0.0,0.207188,0.0,0.0,24438.151689,4.861911,99159.95,77.720817,608.129025,42.736634
min,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,450.0,1982.0,1.0,2.0,599.0,1.0
25%,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,5600.0,2010.0,78083.25,110.0,1598.0,114.0
50%,28.0,0.0,0.0,0.0,29.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,9999.0,2014.0,157435.5,147.0,1968.0,135.0
75%,69.0,0.0,0.0,0.0,29.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,2.0,0.0,19950.0,2017.0,211000.0,190.0,2000.0,159.0
max,4039.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,839496.0,2021.0,2690000.0,780.0,6749.0,1329.0


### Randurile si coloanele datasetului

In [37]:
df.head(5)

Unnamed: 0,id,region_id,region_name,city_id,city_name,district_id,district_name,new_used,url,preview_url,title,title_description_full,created,age,highlighted,urgent,topAd,category_id,ad_packages,ad_features,subtitle,business,hide_user_ads_button,has_dealer_page_feature,status,dealer_logo_in_results,dealer_logo_in_ad_page,dealer_logo,stand_id,map_address,external_partner_code,partner_offer_url,fuel_type,is_promoted,dealer_info,header,header_type,has_phone,has_email,listing_params,map_zoom,map_lat,map_lon,map_radius,map_show_detailed,map_private_radius_size,show_map_for_private,city_label,person,user_label,user_ads_id,user_id,numeric_user_id,user_ads_url,list_label_ad,list_label_small,ad_homepage,ads_bighomepage,ad_featured,hasDealerPackageMapFeature,badges,Categorie,Adauga URL video YouTube,Marca,Model,Anul fabricatiei,Kilometraj,Combustibil,Putere,Capacitate cilindrica,Transmisie,Cutie de viteze,Norma de poluare,Caroserie,Numar de portiere,Culoare,Vopsea metalizata,Eligibil pentru finantare,Garantie dealer (inclusa in pret),Tara de origine,Fara accident in istoric,Carte de service,Numar anunt,Stare,ABS,Airbag-uri frontale,Airbag-uri laterale fata,CD,Computer de bord,Controlul stabilitatii (ESP),Geamuri fata electrice,Inchidere centralizata,Radio,Servodirectie,Aer conditionat,Aer conditionat doua zone,Airbag genunchi sofer,Airbag-uri cortina,Airbag-uri laterale spate,Bluetooth,Comenzi volan,Controlul tractiunii (ASR),Faruri automate,Faruri Xenon,Geamuri cu tenta,Geamuri spate electrice,Jante din aliaj usor,Lumini de zi (LED),Navigatie GPS,Oglinda retrovizoare interioara electrocromatica,Oglinzi retrovizoare exterioare electrocromatice,Oglinzi retrovizoare incalzite,Pilot automat,Proiectoare ceata,Scaune fata incalzite,Senzori parcare spate,Stergatoare parbriz automate,price,fabrication_year,kilometers,fuel,brand_model,horse_power,cubic_capacity,Emisii CO2,Filtru de particule,Inmatriculat,Primul proprietar,Aer conditionat patru zone,Alarma,DVD,Geamuri laterale spate fumurii,Interior din piele,Limitator de viteza,Oglinzi retrovizoare ajustabile electric,Parbriz incalzit,Senzori parcare fata-spate,emissions,Versiune,Generatie,Data primei inmatriculari,Interior din velur,Imobilizator electronic,Intrare auxiliara,VIN,Tuning,Bare longitudinale acoperis,Camera parcare spate,Head-up display,Suspensie reglabila,Carlig remorca,Incalzire auxiliara,dealer_website,Acoperis panoramic,Garantie de la producator pana la,Scaune spate incalzite,Trapa,sau in limita a,Vopsea mata,TV,Masina de epoca,Vopsea nemetalizata,brand_program_id,brand_program_small_logo,Avariata,Volan pe dreapta,Unnamed: 167,Predare leasing,Plata initiala (la predare),Valoare rata lunara,Numar de rate lunare ramase,Valoare reziduala
0,7043759122,13,Timis,97411,Remetea Mare,,,used,https://www.autovit.ro/anunt/audi-a6-ID7GGTv6....,https://www.autovit.ro/i2/anunt/audi-a6-ID7GGT...,Audi A6,Audi A6,Azi 11:10,61,0,0,1,29,Gold,"[add_olx, vas_bundle_2]",[],1,0,False,active,1,1,https://ireland.apollo.olxcdn.com/v1/files/eyJ...,,,,,"{'key': 'diesel', 'value': 'Diesel'}",True,"{'userType': 'Dealer', 'reliabilityBadgeUrl': ...",Anunturile zilei,promoted,1.0,1,"[[Anul fabricatiei, 2010 ], [Kilometraj, 294 0...",12,45.78521193,21.39325281,0,True,2,False,Remetea Mare,Roland Auto TM,Roland Auto TM,ciSm,ciSm,2932498,https://www.autovit.ro/i2/anunturi/user/ciSm/?...,6 990 EUR,\nSe emite factura,False,False,0,True,{'fast_response': True},Autoturisme,https://youtu.be/xqJv5Wn4gmY,Audi,A6,2010,294 000 km,Diesel,170 CP,1 968 cm3,Fata,Automata,Euro 5,Combi,5.0,Gri,Da,Da,12 luni,Germania,Da,Da,7043759122,Second hand,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,6990,2010,294000,diesel,Audi-A6,170.0,1968.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,7044506685,42,Teleorman,69925,Videle,,,used,https://www.autovit.ro/anunt/audi-a4-ID7GK1Zz....,https://www.autovit.ro/i2/anunt/audi-a4-ID7GK1...,Audi A4,Audi A4,Ieri 15:07,1,0,0,1,29,,"[add_olx, topads_15]",[],0,0,False,active,0,0,,,,,,"{'key': 'diesel', 'value': 'Diesel'}",True,"{'userType': 'Privat', 'phones': ['072'], 'use...",,,1.0,1,"[[Anul fabricatiei, 2011 ], [Kilometraj, 298 3...",12,44.2663,25.539,0,True,2,True,Videle,Nikolas,Nikolas,faA,faA,58316,https://www.autovit.ro/i2/anunturi/user/faA/?j...,8 950 EUR,Negociabil\n,False,False,0,False,,Autoturisme,,Audi,A4,2011,298 350 km,Diesel,143 CP,1 998 cm3,4x4-manual,Manuala,Euro 5,Sedan,,Maro,,,,Germania,Da,Da,7044506685,Second hand,True,True,True,True,True,True,True,True,True,True,True,True,True,,True,True,True,True,True,True,True,True,True,True,True,True,,True,True,True,True,True,True,8950,2011,298350,diesel,Audi-A4,143.0,1998.0,152 g/km,Da,Da,Da,True,True,True,True,True,True,True,True,True,152.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,7044051670,35,Dambovita,63351,Targoviste,,,used,https://www.autovit.ro/anunt/audi-a4-2-0-ID7GI...,https://www.autovit.ro/i2/anunt/audi-a4-2-0-ID...,Audi A4 2.0,Audi A4 2.0,Ieri 13:32,31,0,0,1,29,,"[add_olx, topads_15]",[],0,0,False,active,0,0,,,,,,"{'key': 'diesel', 'value': 'Diesel'}",True,"{'userType': 'Privat', 'phones': ['072'], 'use...",,,1.0,1,"[[Anul fabricatiei, 2013 ], [Kilometraj, 206 0...",13,44.9306,25.46067,0,True,2,True,Targoviste,liviu,liviu,8gK,8gK,31790,https://www.autovit.ro/i2/anunturi/user/8gK/?j...,9 599 EUR,Negociabil\n,False,False,0,False,,Autoturisme,,Audi,A4,2013,206 000 km,Diesel,143 CP,1 968 cm3,Fata,Manuala,Euro 5,Sedan,5.0,Argint,Da,,,Germania,Da,Da,7044051670,Second hand,True,True,True,True,True,True,True,True,True,,,,,,,,,,,,,,,,,,,,,,,,,9599,2013,206000,diesel,Audi-A4,143.0,1968.0,149 g/km,Da,Da,,,,,,,,,,,149.0,2.0 TDI,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,7044056336,12,Sibiu,34907,Talmaciu,,,used,https://www.autovit.ro/anunt/volkswagen-caddy-...,https://www.autovit.ro/i2/anunt/volkswagen-cad...,Volkswagen Caddy Maxi,Volkswagen Caddy Maxi,Azi 13:36,29,0,0,1,29,Gold,"[add_olx, olx_homepage_30, topads_30]",[],1,0,False,active,1,1,https://ireland.apollo.olxcdn.com/v1/files/eyJ...,,,,,"{'key': 'diesel', 'value': 'Diesel'}",True,"{'userType': 'Dealer', 'reliabilityBadgeUrl': ...",Anunturile zilei,promoted,1.0,1,"[[Anul fabricatiei, 2013 ], [Kilometraj, 139 7...",13,45.6667,24.2611,0,True,2,False,Talmaciu,Auto 24,Auto 24,bYC8,bYC8,2854612,https://www.autovit.ro/i2/anunturi/user/bYC8/?...,9 500 EUR,\n,False,False,0,True,{'fast_response': True},Autoturisme,,Volkswagen,Caddy,2013,139 795 km,Diesel,140 CP,1 968 cm3,Fata,Automata,Euro 5,Monovolum,5.0,Negru,,,,,,,7044056336,Second hand,True,True,True,True,True,True,True,True,True,,,,,,,,,,,,,,,,,,,,,,,,,9500,2013,139795,diesel,Volkswagen-Caddy,140.0,1968.0,158 g/km,,,,,,,,,,,,,158.0,Maxi 2.0 TDI,Maxi Life,01/04/2013,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,7044521035,34,Valcea,89649,Ramnicu Valcea,,,used,https://www.autovit.ro/anunt/opel-corsa-1-3-ID...,https://www.autovit.ro/i2/anunt/opel-corsa-1-3...,Opel Corsa 1.3,Opel Corsa 1.3,Ieri 17:28,0,0,0,1,29,,"[add_olx, topads_15]",[],0,0,False,active,0,0,,,,,,"{'key': 'diesel', 'value': 'Diesel'}",True,"{'userType': 'Privat', 'phones': ['072'], 'use...",,,1.0,1,"[[Anul fabricatiei, 2009 ], [Kilometraj, 100 0...",13,45.10472,24.37556,0,True,2,True,Ramnicu Valcea,Mădălin,Mădălin,bEfh,bEfh,2776315,https://www.autovit.ro/i2/anunturi/user/bEfh/?...,4 000 EUR,Negociabil\n,False,False,0,False,,Autoturisme,,Opel,Corsa,2009,100 000 km,Diesel,90 CP,1 248 cm3,Fata,Automata,,Masina de oras,5.0,Gri,,,,,,,7044521035,Second hand,True,True,True,True,True,True,True,True,,True,True,,True,,True,True,True,,True,True,,,,,True,,,,True,,,True,,4000,2009,100000,diesel,Opel-Corsa,90.0,1248.0,,,,,,,,True,,,,,,,1.3 CDTI,,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Stergerea anomaliilor

In [81]:
def remove_outliers(x):
    Q1 = df[x].quantile(0.25)
    Q3 = df[x].quantile(0.75)
    IQR = Q3 - Q1 # IQR is interquartile range. 
    filter = (df[x] >= Q1 - 1.5 * IQR) & (df[x] <= Q3 + 1.5 * IQR)
    return df.loc[filter]  

df = remove_outliers('price')
df = remove_outliers('kilometers')
df = remove_outliers('fabrication_year')
df = remove_outliers('cubic_capacity')
df = remove_outliers('horse_power')
df = remove_outliers('emissions')

## 3. Ipoteza in urma cercetarii statisticilor

### Parametrii semnificativi (analiza vizuala):

- [x] price
- [x] ```'{}_{}'.format(brand, model)```
- [x] fuel 
- [x] body
- [x] kilometers (coef. negativ, relatie inversa)
- [x] horse_power
- [x] fabrication_year

## 4. Construirea unui sub-dataset

Pentru `dataset`-ul acestui model sunt folosite doar 3 caracteristici de antrenare, si 1 caracteristica pentru prezicere

```py
# Example
X['kilometers', 'horse_power', 'fabrication_year'] # train set
y['price'] # predict set
```

In [87]:
dtype = [
    'price', 
    'kilometers',  
    'horse_power', 
    'fabrication_year', 
    'Marca',
    'brand_model'
]

df_sub = df[dtype]
df_sub = df_sub.where(df_new['Marca'] == 'BMW')
df_sub = df_sub.drop(['Marca'], axis=1)
df_sub = df_sub.dropna()
df_sub.head()

Unnamed: 0,price,kilometers,horse_power,fabrication_year,brand_model
6,21000.0,235000.0,184.0,2014.0,BMW-Seria 5
122,8990.0,219978.0,177.0,2009.0,BMW-Seria 3
166,8000.0,269000.0,184.0,2011.0,BMW-Seria 3
168,20700.0,129000.0,190.0,2016.0,BMW-X3
175,24490.0,130000.0,190.0,2017.0,BMW-Seria 5


In [88]:
# adjusted dtype for later use
dtype = [
    'price', 
    'kilometers',  
    'horse_power', 
    'fabrication_year', 
    'brand_model'
]

df_sub.head()

Unnamed: 0,price,kilometers,horse_power,fabrication_year,brand_model
6,21000.0,235000.0,184.0,2014.0,BMW-Seria 5
122,8990.0,219978.0,177.0,2009.0,BMW-Seria 3
166,8000.0,269000.0,184.0,2011.0,BMW-Seria 3
168,20700.0,129000.0,190.0,2016.0,BMW-X3
175,24490.0,130000.0,190.0,2017.0,BMW-Seria 5


## 5. Conversia datelor categorice in date ordinare

In [89]:
# Defining the map function
def dummies(x):
    temp = pd.get_dummies(df_model[x], drop_first = True)
    df_dummies = pd.concat([df_model, temp], axis = 1)
    df_dummies.drop([x], axis = 1, inplace = True)
    return df_dummies

# df_model = dummies('brand_model')

df_model['brand_model'] = df_sub['brand_model'].astype('category')
df_model.dtypes

dtype('O')

In [90]:
df_model['brand_model_id'] = df_model['brand_model'].cat.codes
df_model.head()

6      BMW-Seria 5
122    BMW-Seria 3
166    BMW-Seria 3
168         BMW-X3
175    BMW-Seria 5
Name: brand_model, dtype: object

### Forma dataset-ului

In [91]:
df_model.shape

(316,)

## 6. Impartirea dataset-ului in doua dataset-uri

```py
df_train = 0.8 * df # used to train the model
df_test  = 0.2 * df # used to test the model
```

In [92]:
from sklearn.model_selection import train_test_split

def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)


# clean of NaN, inf, -inf
df_model = clean_dataset(df_model)
np.random.seed(0)

# split data 
df_train, df_test = train_test_split(df_model, train_size = 0.8, test_size = 0.2, random_state = 100)

AssertionError: df needs to be a pd.DataFrame

## 7. Scalarea / uniformizarea datelor (0.0 - 1.0)

In [31]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_train[dtype] = scaler.fit_transform(df_train[dtype])

KeyError: "['brand_model'] not in index"

In [None]:
df_train.describe()

### Intercorelarea parametriilor

In [None]:
plt.figure(figsize = (12, 10))
sea.heatmap(df_train.corr(), annot = True, cmap="YlOrRd")
plt.show()

## 8. Impartirea datasetului df_train in X, y

In [None]:
# X = columns to form ecuation 
X_train = df_train
# y = column 'price', to form eq_res == y
y_train = df_train.pop('price')

if np.any(np.isnan(X_train)) or np.any(np.isnan(y_train)):
    raise Exception

X_train.head()

## 9. Crearea modelului de regresie liniara si eliminarea caracteristicilor recursive

 `RFE` =  Recursive Feature Elimination, is a feature selection algorithm.  

> Feature selection refers to techniques that select a subset of the most relevant features (columns) for a dataset.  
> Fewer features can allow machine learning algorithms to run more efficiently (less space or time complexity) and be more effective

In [None]:
# Recursive feature elimination
# Pruning low correlation coef. features
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor

lm = LinearRegression()
lm.fit(X_train, y_train)
rfe = RFE(lm, 10, verbose=1)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Show list with supported features and ranking
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
X_train.columns[rfe.support_]

### X_train_rfe = datasetul nou

In [None]:
X_train_rfe = X_train[X_train.columns[rfe.support_]]
X_train_rfe.head()

## 10. Calcularea efectiva a modelului de Regresie Liniara cu datasetul X_train_rfe

In [None]:
def build_model(X, y):
    X = sm.add_constant(X) # adding the constant col of 1 1 1 1 1
    lm = sm.OLS(y, X).fit() # fitting the model
    print(lm.summary()) # model summary
    return X

In [None]:
X_train_new = build_model(X_train_rfe, y_train)

##### Parametri cu o valoare Prob (F-statistic) > 0.05 (alfa) trebuie eliminati

> The significance level α (alfa) is the probability of making the wrong decision when the null hypothesis is true. 

In [None]:
# X_train_new = X_train_new.drop(['cubic_centimeters'], axis = 1)
# X_train_new = build_model(X_train_new, y_train)

## 11. Calculul factorului de inflatie a variatiei

`VIF` = Variance Inflation Factor, for one exogenous variable.

> The variance inflation factor is a measure for the increase of the variance of the parameter estimates if an additional variable, given by exog_idx is added to the linear regression.  
> It is a measure for multicollinearity of the design matrix, exog.  
> 
> One recommendation is that if VIF is greater than 5, then the explanatory variable given by exog_idx is highly collinear with the other explanatory variables.  
> The parameter estimates will have large standard errors because of this.

In [None]:
def checkVIF(X):
    vif = pd.DataFrame()
    vif['Features'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    vif = vif.sort_values(by = "VIF", ascending = False)
    return(vif)

checkVIF(X_train)

### Deleting > 5 values

In [None]:
# X_train_new = X_train_new.drop(['horse_power'], axis = 1)
# X_train_new = build_model(X_train_new, y_train)

# checkVIF(X_train_new)

## 12. Analiza distributiei erorilor, y_train vs y_train_predict

Se analizeaza erorile intre pretul estimat din `y_train_predict` vs pretul real din `y_train`

In [None]:
lm = sm.OLS(y_train, X_train_new).fit()
y_train_predict = lm.predict(X_train_new)

# Plot the histogram of the error terms
fig = plt.figure()
sea.distplot((y_train - y_train_predict), bins = 30)
fig.suptitle('Distributia Erorilor', fontsize = 16)
plt.xlabel('Erori', fontsize = 14)
plt.show()

Termenii de eroare par a fii aproximativ normal distribuiti, ipoteza unui model liniar se adevereste.

## 13. Preziceri pe datasetul df_test

In [None]:
# Scaling the test set
df_test[dtype] = scaler.fit_transform(df_test[dtype])

In [None]:
# Dividing into X and y
X_test = df_test
y_test = df_test.copy().pop('price')

In [None]:
# Now let's use our model to make predictions.
X_train_new = X_train_new.drop('const', axis=1)

# Creating X_test_new dataframe by dropping variables from X_test
X_test_new = X_test[X_train_new.columns]

# Adding a constant variable 
X_test_new = sm.add_constant(X_test_new)

In [None]:
# Making predictions
y_pred = lm.predict(X_test_new)

## 14. Evaluarea modelului

### R^2, coeficientul de determinare

`R^2` = (coefficient of determination) regression score function

>Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse).   
> 
>A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.

In [None]:
from sklearn.metrics import r2_score 
r2 = r2_score(y_test, y_pred)

print('r^2 = {}'.format(r2))

### Reprentarea grafica y_test vs y_pred

In [None]:
fig = plt.figure()
plt.scatter(y_test, y_pred)
fig.suptitle('y_test vs y_pred', fontsize=18)
plt.xlabel('y_test', fontsize=14)
plt.ylabel('y_pred', fontsize=14)
plt.show()

### Descrierea modelului

In [None]:
print(lm.summary())

In [None]:
print('R-squred and Adjusted R-squared ~= {}'.format(r2))

F-statistic > 2500  
Prob(F-statistic) (overall model fit) == 0

> The F statistic must be used in combination with the p value when you are deciding if your overall results are significant. Why? If you have a significant result, it doesn’t mean that all your variables are significant. The statistic is just comparing the joint effect of all the variables together.
>
> 1. If the p value is less than the alpha level, go to Step 2 (otherwise your results are not significant and you cannot reject the null hypothesis). A common alpha level for tests is 0.05.
>
> 2. Study the individual p values to find out which of the individual variables are statistically significant.