In [219]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import classification_report

In [193]:
# Load the weather observations and report how large the table is.
df = pd.read_csv("meteo.csv")
print("Dataset size")
print("Rows {} Columns {}".format(*df.shape))

Dataset size
Rows 142193 Columns 24


In [194]:
print("Columns and data types")
# One row per column, with its dtype in a single column named 'dtype'.
df.dtypes.to_frame(name='dtype')  # data types

Columns and data types


Unnamed: 0,dtype
Date,object
Location,object
MinTemp,float64
MaxTemp,float64
Rainfall,float64
Evaporation,float64
Sunshine,float64
WindGustDir,object
WindGustSpeed,float64
WindDir9am,object


In [195]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

Date                 0
Location             0
MinTemp            637
MaxTemp            322
Rainfall          1406
Evaporation      60843
Sunshine         67816
WindGustDir       9330
WindGustSpeed     9270
WindDir9am       10013
WindDir3pm        3778
WindSpeed9am      1348
WindSpeed3pm      2630
Humidity9am       1774
Humidity3pm       3610
Pressure9am      14014
Pressure3pm      13981
Cloud9am         53657
Cloud3pm         57094
Temp9am            904
Temp3pm           2726
RainToday         1406
RISK_MM              0
RainTomorrow         0
dtype: int64

In [196]:
# Keep only the temperature, rainfall and humidity measurements plus the
# RainToday / RISK_MM / RainTomorrow columns; every other column is dropped.
# `axis` is passed by keyword (as the later drop calls in this notebook do):
# the positional form `df.drop(labels, 1)` is rejected by newer pandas.
columns_to_drop = [
    'Date', 'Location', 'Evaporation', 'Sunshine',
    'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
    'WindSpeed9am', 'WindSpeed3pm',
    'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
]
df = df.drop(columns_to_drop, axis=1)

In [197]:
# Missing values per column once the unused columns are gone.
df.isna().sum()

MinTemp          637
MaxTemp          322
Rainfall        1406
Humidity9am     1774
Humidity3pm     3610
Temp9am          904
Temp3pm         2726
RainToday       1406
RISK_MM            0
RainTomorrow       0
dtype: int64

In [198]:
# NaN-ignoring mean of every numeric column that is imputed further down.
# The names on the left line up, in order, with the columns on the right.
(
    MinTempMean,
    MaxTempMean,
    RainfallMean,
    Humidity9amMean,
    Humidity3pmMean,
    Temp9amMean,
    Temp3pmMean,
) = (
    np.nanmean(df[column])
    for column in (
        'MinTemp',
        'MaxTemp',
        'Rainfall',
        'Humidity9am',
        'Humidity3pm',
        'Temp9am',
        'Temp3pm',
    )
)

In [199]:
def approx_MinTemp(fact):
    """Return ``fact`` unchanged, or ``MinTempMean`` when ``fact`` is null."""
    return MinTempMean if pd.isnull(fact) else fact

In [200]:
def approx_MaxTemp(fact):
    """Return ``fact`` unchanged, or ``MaxTempMean`` when ``fact`` is null."""
    return MaxTempMean if pd.isnull(fact) else fact

In [201]:
def approx_Rainfall(fact):
    """Return ``fact`` unchanged, or ``RainfallMean`` when ``fact`` is null."""
    return RainfallMean if pd.isnull(fact) else fact

In [202]:
def approx_Humidity9am(fact):
    """Return ``fact`` unchanged, or ``Humidity9amMean`` when ``fact`` is null."""
    return Humidity9amMean if pd.isnull(fact) else fact

In [203]:
def approx_Humidity3pm(fact):
    """Return ``fact`` unchanged, or ``Humidity3pmMean`` when ``fact`` is null."""
    return Humidity3pmMean if pd.isnull(fact) else fact

In [204]:
def approx_Temp9am(fact):
    """Return ``fact`` unchanged, or ``Temp9amMean`` when ``fact`` is null."""
    return Temp9amMean if pd.isnull(fact) else fact

In [205]:
def approx_Temp3pm(fact):
    """Return ``fact`` unchanged, or ``Temp3pmMean`` when ``fact`` is null."""
    return Temp3pmMean if pd.isnull(fact) else fact

In [206]:
# Replace the missing values of each numeric column with that column's mean.
# Series.fillna performs the substitution in one vectorised call, instead of
# invoking a Python function once per row through .apply().
df['MinTemp'] = df['MinTemp'].fillna(MinTempMean)
df['MaxTemp'] = df['MaxTemp'].fillna(MaxTempMean)
df['Rainfall'] = df['Rainfall'].fillna(RainfallMean)
df['Humidity9am'] = df['Humidity9am'].fillna(Humidity9amMean)
df['Humidity3pm'] = df['Humidity3pm'].fillna(Humidity3pmMean)
df['Temp9am'] = df['Temp9am'].fillna(Temp9amMean)
df['Temp3pm'] = df['Temp3pm'].fillna(Temp3pmMean)
# Show the per-column null counts after the imputation.
df.isnull().sum()

MinTemp            0
MaxTemp            0
Rainfall           0
Humidity9am        0
Humidity3pm        0
Temp9am            0
Temp3pm            0
RainToday       1406
RISK_MM            0
RainTomorrow       0
dtype: int64

In [207]:
# Discard every row that still contains a missing value, then re-check
# the per-column null counts.
df = df.dropna()
df.isnull().sum()

MinTemp         0
MaxTemp         0
Rainfall        0
Humidity9am     0
Humidity3pm     0
Temp9am         0
Temp3pm         0
RainToday       0
RISK_MM         0
RainTomorrow    0
dtype: int64

In [208]:
# One row per remaining column, with its dtype in a column named 'dtype'.
df.dtypes.to_frame(name='dtype')  # data types

Unnamed: 0,dtype
MinTemp,float64
MaxTemp,float64
Rainfall,float64
Humidity9am,float64
Humidity3pm,float64
Temp9am,float64
Temp3pm,float64
RainToday,object
RISK_MM,float64
RainTomorrow,object


In [209]:
# Turn the text column RainToday into a single numeric indicator column.
# drop_first=True drops the first category, leaving the 'Yes' indicator.
# dtype is set explicitly so the indicator is an integer 0/1 column on every
# pandas version (newer releases produce booleans when dtype is omitted).
RainToday = pd.get_dummies(df['RainToday'], drop_first=True, dtype='uint8')

# Swap the text column for the indicator, appended as the last column, and
# name it by relabelling 'Yes' on the dummy frame itself rather than by
# searching the combined column list for the first 'Yes' entry.
df = pd.concat(
    [df.drop(['RainToday'], axis=1), RainToday.rename(columns={'Yes': 'RainToday'})],
    axis=1,
)

In [210]:
# Turn the text column RainTomorrow into a single numeric indicator column.
# drop_first=True drops the first category, leaving the 'Yes' indicator.
# dtype is set explicitly so the indicator is an integer 0/1 column on every
# pandas version (newer releases produce booleans when dtype is omitted).
RainTomorrow = pd.get_dummies(df['RainTomorrow'], drop_first=True, dtype='uint8')

# Swap the text column for the indicator, appended as the last column, and
# name it by relabelling 'Yes' on the dummy frame itself rather than by
# searching the combined column list for the first 'Yes' entry.
df = pd.concat(
    [df.drop(['RainTomorrow'], axis=1), RainTomorrow.rename(columns={'Yes': 'RainTomorrow'})],
    axis=1,
)

In [211]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Humidity9am,Humidity3pm,Temp9am,Temp3pm,RISK_MM,RainToday,RainTomorrow
0,13.4,22.9,0.6,71.0,22.0,16.9,21.8,0.0,0,0
1,7.4,25.1,0.0,44.0,25.0,17.2,24.3,0.0,0,0
2,12.9,25.7,0.0,38.0,30.0,21.0,23.2,0.0,0,0
3,9.2,28.0,0.0,45.0,16.0,18.1,26.5,1.0,0,0
4,17.5,32.3,1.0,82.0,33.0,17.8,29.7,0.2,0,0


In [213]:
# Variable objetivo
X = df.ix[:,(0,1,2,3,4,5,6,7,8)].values
# Variables predictivas (o explicativas)
Y = df.ix[:,9].values

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


In [214]:
# Hold out 30% of the rows for evaluation; the seed is fixed so the split
# is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=25,
)

In [215]:
#Aprendizaje
LogReg = LogisticRegression()
LogReg.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [216]:
# Render the fitted parameters as text first, then print one line each.
coef_text = str(LogReg.coef_)
intercept_text = str(LogReg.intercept_)
print("coeficientes del modelo: " + coef_text)
print("intercept: " + intercept_text)

coeficientes del modelo: [[ 8.41321051e-02 -6.29783354e-02  1.13993395e-02 -1.59474511e-02
  -1.83435429e-02 -1.93787367e-02 -5.12931483e-02  1.65635285e+01
  -2.02825441e-01]]
intercept: [-14.29375895]


In [221]:
# Predict on the held-out split in this cell so the report does not rely on
# a `y_pred` variable left in the kernel by a cell that is not in the notebook.
y_pred = LogReg.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     32853
           1       1.00      1.00      1.00      9384

   micro avg       1.00      1.00      1.00     42237
   macro avg       1.00      1.00      1.00     42237
weighted avg       1.00      1.00      1.00     42237

