In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

In [None]:
# 훈련데이터 로딩
train = pd.read_csv("C:/sample/bike/train.csv")
train.tail()

In [None]:
# 테스트 데이터 로딩
test = pd.read_csv("C:/sample/bike/test.csv")
test.tail()

In [None]:
# Missing-value check: isna() flags each missing cell as True (counted as 1),
# so the column-wise sum is the number of missing values per column.
print(train.isna().sum())
test.isna().sum()

In [None]:
# Check the data type and non-null count of every column.
train.info()

In [None]:
# Last rows of the training frame.
train.tail()

In [None]:
# Mean of the target column (still on its raw scale at this point).
train["count"].mean()

In [None]:
# Largest target value.
train["count"].max()

In [None]:
# Smallest target value.
train["count"].min()

In [None]:
# Mean of the windspeed column.
train["windspeed"].mean()

In [None]:
# 차트를 그려 정규분포인지 확인  아닐시 > 정규화
sns.distplot(train["count"])

In [None]:
train["count"]

In [None]:
sns.distplot(np.log1p(train["count"]))

In [None]:
# Replace the target column with log1p(count).
# NOTE(review): the column is overwritten in place, so running this cell a
# second time applies log1p twice.
train["count"] = np.log1p(train["count"])

In [None]:
# Mean of the log-transformed target.
train["count"].mean()

In [None]:
# Maximum of the log-transformed target.
train["count"].max()

In [None]:
# Minimum of the log-transformed target.
train["count"].min()

In [None]:
# Summary statistics of the log-transformed target.
train["count"].describe()

In [None]:
# Summary statistics of the windspeed column.
train["windspeed"].describe()

In [None]:
# 훈련 데이터와 테스트 데이터를 데이터 프레임으로 합쳐서 전처리
train.shape

In [None]:
test.shape

In [None]:
# 각각의 데이터(행)의 수를 저장
ntrain = train.shape[0]
ntest = test.shape[0]

In [None]:
# 종속변수를 분리시켜 따로 저장
y_train = train["count"].values
y_train

In [None]:
# 훈련 데이터 + 테스트 데이터 > 전처리
all_data = pd.concat((train, test)).reset_index(drop = True)

In [None]:
all_data.tail()

In [None]:
# 불필요한 변수(독립변수가 아닌 변수)를 삭제(drop)
all_data.drop(["casual"], axis = 1, inplace = True) # axis = 0(행 삭제), axis = 1(열 삭제)

In [None]:
all_data.drop(["count", "registered"], axis = 1, inplace = True)

In [None]:
all_data.tail()

In [None]:
all_data.isnull().sum()

In [None]:
all_data.corr()

In [None]:
all_data.head()

In [None]:
# Standard-library class used for date calculations.
from datetime import datetime

In [None]:
# 배열에 apply를 넣으면 모든 행에 적용 가능
# datatime에서 날자만 불리
all_data["date"] = all_data.datetime.apply(lambda x : x.split(" ")[0]) # lamda = 일회용 함수, .split() 안에 생략하면 자동으로 공백으로 자름

In [None]:
all_data.tail()

In [None]:
# Extract the hour of day from "datetime" and store it as an integer, so that
# "time" is a numeric feature (usable in correlations and by the model)
# instead of a column of text fragments.
all_data["time"] = pd.to_datetime(all_data["datetime"]).dt.hour
all_data.head()

In [None]:
# date에서 요일을 추가
all_data["weekday"] = all_data.date.apply(lambda x :datetime.strptime(x, "%Y-%m-%d").weekday()) # str ptime

In [None]:
all_data.head()

In [None]:
# date에서 월을 분리해서 추가
all_data["month"] = all_data.date.apply(lambda x :datetime.strptime(x, "%Y-%m-%d").month)

In [None]:
all_data.head()

In [None]:
all_data.drop(["datetime"], axis = 1, inplace = True)

In [None]:
all_data.head()

In [None]:
ntrain

In [None]:
train = all_data[0:ntrain]

In [None]:
test = all_data[ntrain:]

In [None]:
# Last rows of the train split.
train.tail()

In [None]:
# First rows of the test split.
test.head()

In [None]:
y_train

In [None]:
# Put the separately stored target values back as the "count" column.
train["count"] = y_train

In [None]:
train.tail()

In [None]:
# Missing values per column after re-attaching the target.
train.isnull().sum()

In [None]:
# 변수간의 상관관계 출력
train.corr()

In [None]:
sns.heatmap(data = train.corr(), annot = True, fmt = ".2f") # annot : 수치보기, fmt : 포맷팅

In [None]:
# 각각의 독립변수와 종속변수간의 상관관계를 시각화
# count ~ season
group_season = train.groupby(["season"])["count"].sum().reset_index() # reset_index() : 행과 열의 인덱스 추가
group_season

In [None]:
sns.barplot(x = group_season["season"], y = group_season["count"])

In [None]:
# count ~ weekday
group_weekday = train.groupby(["weekday"])["count"].sum().reset_index()
group_weekday

In [None]:
sns.barplot(x = group_weekday["weekday"], y = group_weekday["count"])

In [None]:
# count ~ month
group_month = train.groupby(["month"])["count"].sum().reset_index()
group_month

In [None]:
sns.barplot(x = group_month["month"], y = group_month["count"])

In [None]:
# count ~ time
group_time = train.groupby(["time"])["count"].sum().reset_index()
group_time

In [None]:
sns.barplot(x = group_time["time"], y = group_time["count"])

In [None]:
train.head()

In [None]:
# count ~ temp
group_temp = train.groupby(["temp"])["count"].sum().reset_index()
group_temp

In [None]:
sns.barplot(x = group_temp["temp"], y = group_temp["count"])

In [None]:
# count ~ humidity
group_humidity = train.groupby(["humidity"])["count"].sum().reset_index()
group_humidity

In [None]:
sns.barplot(x = group_humidity["humidity"], y = group_humidity["count"])

In [None]:
# Summary statistics of the temp column.
train["temp"].describe()

In [None]:
# Summary statistics of the humidity column.
train["humidity"].describe()

In [None]:
train_test_data = [train, test] # a plain Python list, not a DataFrame, so .apply cannot be used on it

In [None]:
# Coarsen temp into 8 ordinal bins of width 5:
#   <= 5 -> 0, (5, 10] -> 1, (10, 15] -> 2, ..., (30, 35] -> 6, > 35 -> 7.
# pd.cut labels every row in one pass with a scalar bin index; assigning each
# bin with a trailing-comma statement would store a 1-tuple instead of a scalar.
temp_edges = [-np.inf, 5, 10, 15, 20, 25, 30, 35, np.inf]
for frame in train_test_data:
    # Right-closed intervals (the pd.cut default) reproduce the "<=" upper
    # bounds; labels=False returns the integer index of the bin.
    frame["temp"] = pd.cut(frame["temp"], bins=temp_edges, labels=False)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# count ~ temp
group_temp = train.groupby(["temp"])["count"].sum().reset_index()
group_temp

In [None]:
sns.barplot(x = group_temp["temp"], y = group_temp["count"])

In [None]:
# Coarsen humidity into 10 ordinal bins of width 10:
#   <= 10 -> 0, (10, 20] -> 1, (20, 30] -> 2, ..., (80, 90] -> 8, > 90 -> 9.
# pd.cut labels every row in one pass with a scalar bin index; assigning each
# bin with a trailing-comma statement would store a 1-tuple instead of a scalar.
humidity_edges = [-np.inf, 10, 20, 30, 40, 50, 60, 70, 80, 90, np.inf]
for frame in train_test_data:
    # Right-closed intervals (the pd.cut default) reproduce the "<=" upper
    # bounds; labels=False returns the integer index of the bin.
    frame["humidity"] = pd.cut(frame["humidity"], bins=humidity_edges, labels=False)

In [None]:
train.head()

In [None]:
group_humidity = train.groupby(["humidity"])["count"].sum().reset_index()
group_humidity

In [None]:
sns.barplot(x = group_humidity["humidity"], y = group_humidity["count"])

In [None]:
train.drop("atemp", axis = 1, inplace = True)

In [None]:
train.head()

In [None]:
test.drop("atemp", axis = 1, inplace = True)
test.head()

In [None]:
train.drop("date", axis = 1, inplace = True)
test.drop("date", axis = 1, inplace = True)

In [None]:
train.drop(["count"], axis = 1, inplace = True)

In [None]:
train.head()

In [None]:
# 3. Modeling
# Choose the model (algorithm) -> LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Instantiate the chosen model.
model = LinearRegression()

In [None]:
# 훈련데이터를 이용하여 훈련(상수항 결합 -> 훈련)
result = model.fit(X = train, y = y_train)

In [None]:
# 훈련된 모델의 절편 & 계수
dir(result)
result.intercept_

In [None]:
model.coef_

In [None]:
# Fitted equation (the target is log1p(count)): log1p(count) = 3.045 + 0.05*holiday - 0.15*humidity - ......

In [None]:
pre = model.predict(test)

In [None]:
pre

In [None]:
test["count"] = pre

In [None]:
test.head()

In [None]:
train_pre = model.predict(train)

In [None]:
train_pre

In [None]:
y_train

In [None]:
sns.scatterplot(y_train, train_pre)