# Linear Regression and Logistic Regression

In [1]:
# Packages
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Model selection and evaluation tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Read the listings table from the working directory, then print a structural
# summary (column names, non-null counts, dtypes, memory footprint).
data = pd.read_csv(filepath_or_buffer='BatonRouge.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 11 columns):
Price         1080 non-null int64
SQFT          1080 non-null int64
Bedrooms      1080 non-null int64
Baths         1080 non-null int64
Age           1080 non-null int64
Occupancy     1080 non-null int64
Pool          1080 non-null int64
Style         1080 non-null int64
Fireplace     1080 non-null int64
Waterfront    1080 non-null int64
DOM           1080 non-null int64
dtypes: int64(11)
memory usage: 92.9 KB


In [4]:
# Peek at the first five rows to sanity-check the parsed columns
data.head(n=5)

Unnamed: 0,Price,SQFT,Bedrooms,Baths,Age,Occupancy,Pool,Style,Fireplace,Waterfront,DOM
0,66500,741,1,1,18,1,1,1,1,0,6
1,66000,741,1,1,18,2,1,1,0,0,23
2,68500,790,1,1,18,1,0,1,1,0,8
3,102000,2783,2,2,18,1,0,1,1,0,50
4,54000,1165,2,1,35,2,0,1,0,0,190


In [6]:
# Simple (one-predictor) regression setup: x is the SQFT column, y is the Price column
x, y = data['SQFT'], data['Price']

In [7]:
# Model fitting: estimate the coefficients beta0 (intercept) and beta1 (slope)
import statsmodels.api as sm

# Put a leading 'const' column of ones in front of SQFT so the regression can
# estimate an intercept (prepend=True is add_constant's default).
x_with_intercept = sm.add_constant(x)

# Show the first five rows of the resulting two-column design matrix
x_with_intercept.head(n=5)

Unnamed: 0,const,SQFT
0,1.0,741
1,1.0,741
2,1.0,790
3,1.0,2783
4,1.0,1165


In [8]:
# Ordinary least squares of Price on [const, SQFT]. The unfitted model and the fitted
# results keep separate names because a later cell calls results.predict(...).
model = sm.OLS(endog=y, exog=x_with_intercept)
results = model.fit()

# Text report: coefficients, standard errors, R-squared, F-statistic, etc.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.579
Model:                            OLS   Adj. R-squared:                  0.578
Method:                 Least Squares   F-statistic:                     1480.
Date:                Sun, 07 Jun 2020   Prob (F-statistic):          1.54e-204
Time:                        11:03:43   Log-Likelihood:                -13722.
No. Observations:                1080   AIC:                         2.745e+04
Df Residuals:                    1078   BIC:                         2.746e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6.086e+04   6110.187     -9.961      0.0

In [45]:
# A single observation laid out in the same column order as the design matrix:
# [const, SQFT] -> intercept term of 1 and a SQFT value of 1500.
new_house = [1, 1500]

# Predicted Price for that one row
results.predict(exog=new_house)

array([78259.59937286])

# Logistic Regression

C：类似于 lasso 和 ridge 里的 λ，但方向相反——C 是正则化强度的倒数（可以理解为 C ≈ 1/λ）。C 越大，正则化越弱，越接近于无正则化的纯逻辑回归（类比于 λ 越小越接近纯线性回归）。如果不填，默认为 1.0。

penalty：l1类似于lasso，l2类似于ridge。如果不填，默认为l2。

class_weight：在类别不平衡（imbalanced classification）问题中，我们希望样本较少的那个类别也能被充分考虑，这时可以将 class_weight 设置为 'balanced'，即 class_weight='balanced'。如果不填，默认为 None，即所有类别权重相同，也就是我们最基本的逻辑回归。

In [35]:
from sklearn.datasets import load_breast_cancer

# Container returned by the loader; the feature matrix and labels are read from it below
data_dict = load_breast_cancer()

# NOTE: `y` held the housing Price column earlier in this notebook; from this cell on
# it is rebound to the classification target.
X, y = data_dict.data, data_dict.target

In [36]:
from sklearn.linear_model import LogisticRegression

# Estimator with default settings; C=1.0 is the default regularization parameter,
# written out here to match the description of C in the notes above.
lr_model = LogisticRegression(C=1.0)

# Print the estimator's configuration
print(lr_model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [37]:
# Train on the complete feature matrix and labels (no hold-out split is made here);
# fit() returns the estimator itself, which the notebook displays as the cell output.
lr_model.fit(X=X, y=y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
# Fitted coefficients of the logistic model; the displayed array has a single row
# of 30 values.
lr_model.coef_

array([[ 2.14255611e+00,  1.19509857e-01, -7.76065614e-02,
        -2.63952341e-03, -1.55150939e-01, -4.12221000e-01,
        -6.55000796e-01, -3.43525103e-01, -2.27461843e-01,
        -2.68287285e-02, -2.12777314e-02,  1.29020735e+00,
         1.95855311e-02, -9.66546553e-02, -1.68391816e-02,
         1.00285617e-03, -5.13777284e-02, -4.04958496e-02,
        -4.29002319e-02,  5.92543569e-03,  1.29601147e+00,
        -3.48075791e-01, -1.20237099e-01, -2.47712838e-02,
        -2.87288179e-01, -1.17582446e+00, -1.62124781e+00,
        -6.62235441e-01, -6.99945260e-01, -1.18401588e-01]])

In [None]:
# `x_test` was never defined anywhere in this notebook, so the original call raised
# NameError on a fresh kernel. Build a reproducible hold-out split first.
# stratify=y keeps the class proportions the same in both parts; random_state pins
# the shuffle so Restart & Run All gives identical predictions.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Refit on the training part only: the earlier fit used all of X, which would let the
# model see the rows it is about to be evaluated on.
lr_model.fit(x_train, y_train)

# Predicted class labels for the unseen rows (compare against y_test to evaluate)
lr_model.predict(x_test)