In [158]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [159]:
weather_data = pd.read_csv('weatherAUS.zip')
weather_data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [160]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [161]:
weather_data.isnull().sum().sum()

343248

In [162]:
# Вычисляем процент пропущенных значений для каждого признака
missing_values_percent = weather_data.isnull().mean() * 100

# Создаем список признаков, в которых более 40% значений отсутствуют
columns_to_drop = missing_values_percent[missing_values_percent > 40].index.tolist()
print(columns_to_drop)

# Удаляем указанные признаки из DataFrame
df = weather_data.drop(columns=columns_to_drop)

['Evaporation', 'Sunshine', 'Cloud3pm']


In [163]:
df['RainToday'].isnull().sum()

3261

In [164]:
df['RainToday'].value_counts()

RainToday
No     110319
Yes     31880
Name: count, dtype: int64

In [165]:
def rain(x):
    if x == 'Yes':
        return 1
    elif x == 'No':
        return 0
    else:
        return np.nan 

In [166]:
df['RainToday'] = df['RainToday'].apply(rain)
df['RainTomorrow'] = df['RainTomorrow'].apply(rain)

In [167]:
df['RainToday'].value_counts()

RainToday
0.0    110319
1.0     31880
Name: count, dtype: int64

In [168]:
# Вычислить среднее арифметическое для преобразованного признака RainToday
mean_RainToday = df['RainToday'].mean()
mean_RainToday_rounded = round(mean_RainToday, 2)
print(mean_RainToday_rounded)

0.22


In [169]:
# Преобразуем признак Date в формат даты
df['Date'] = pd.to_datetime(df['Date'])

# Создадим новый признак Month, содержащий номер месяца
df['Month'] = df['Date'].dt.month

# Удалим исходный признак Date
df = df.drop('Date', axis=1)

In [170]:
# Подсчитываем долю дождливых дней относительно всех дней для каждого месяца
rainy_days_percent = df.groupby('Month')['RainToday'].apply(lambda x: (x == 1).mean())

# Находим месяц с самой большой долей дождливых дней
max_rainy_month = rainy_days_percent.idxmax()

# Выводим порядковый номер месяца с самой большой долей дождливых дней
print(max_rainy_month)

7


In [171]:
categoricals = ['Month', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
dummy_variables = pd.get_dummies(df, columns=categoricals)

In [172]:
dummy_variables

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,...,False,False,False,False,False,False,False,False,True,False
1,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,...,False,False,False,False,False,False,False,False,False,True
2,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,...,False,False,False,False,False,False,False,False,False,True
3,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,...,False,False,False,False,False,False,False,False,False,False
4,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,...,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2.8,23.4,0.0,31.0,13.0,11.0,51.0,24.0,1024.6,1020.3,...,False,False,False,False,False,False,False,False,False,False
145456,3.6,25.3,0.0,22.0,13.0,9.0,56.0,21.0,1023.5,1019.1,...,False,False,False,False,False,False,False,False,False,False
145457,5.4,26.9,0.0,37.0,9.0,9.0,53.0,24.0,1021.0,1016.8,...,False,False,False,False,False,False,False,False,True,False
145458,7.8,27.0,0.0,28.0,13.0,7.0,51.0,24.0,1019.4,1016.5,...,False,False,False,False,False,False,False,False,False,False


In [173]:
# Удаление строк, где есть пропуски
cleaned_data = dummy_variables.dropna()

# Разбиение данных на обучающую и тестовую выборки
X = cleaned_data.drop('RainTomorrow', axis=1)  # признаки
y = cleaned_data['RainTomorrow']  # целевая переменная

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=31)

# Вычисление среднего значения целевой переменной на тестовой выборке
mean_target_variable = y_test.mean()
print(round(mean_target_variable, 2))

0.23


In [174]:
# Зафиксируем случайность
np.random.seed(31)

# Генерируем 1000 случайных выборок
sample_means = []
for _ in range(1000):
    indices = np.random.randint(0, len(X_train), len(X_train))  # Генерируем случайные индексы
    sample = X_train.iloc[indices]  # Извлекаем соответствующие элементы выборки
    sample_mean = sample['MinTemp'].mean()  # Вычисляем среднее значение для каждой выборки
    sample_means.append(sample_mean)

# Вычисляем стандартное отклонение для средних значений выборок
std_dev = np.std(sample_means)
print(round(std_dev, 2))

0.03


In [175]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
preds_train = clf.predict(X_train)
preds_test = clf.predict(X_test)
roc_auc_score(y_test, preds_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7264029366389834

In [176]:
params = {'max_leaf_nodes': list(range(2, 10)), 'min_samples_split': [2, 3, 4], 'max_depth': [5,7,9,11]}

In [177]:
# Создание экземпляра модели дерева решений
dr = DecisionTreeClassifier(random_state=42)

cv = StratifiedKFold(n_splits=3)
# Создание экземпляра GridSearchCV
grid_search = GridSearchCV(estimator=dr, param_grid=params, cv=cv)
# Выполнение поиска по сетке
grid_search.fit(X_train, y_train)

# Получение наилучших параметров и оценки
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Вывод результатов
print("Наилучшие параметры:", best_params)
print("Лучшая оценка (roc_auc_score:", best_score)

Наилучшие параметры: {'max_depth': 5, 'max_leaf_nodes': 9, 'min_samples_split': 2}
Лучшая оценка (roc_auc_score: 0.8372916567564265


In [178]:
dr = DecisionTreeClassifier(random_state=42, max_depth=5, max_leaf_nodes=9, min_samples_split=2)
dr.fit(X_train, y_train)
preds_train = dr.predict(X_train)
preds_test = dr.predict(X_test)
roc_auc_score(y_test, preds_test)

0.7033229072349596

In [179]:
rf = RandomForestClassifier(random_state=31, n_estimators=100)

rf.fit(X_train, y_train)
preds_train = rf.predict(X_train)
preds_test = rf.predict(X_test)
roc_auc_score(y_test, preds_test)

0.7329684570290497

In [180]:
params = {'max_features': [ 4, 5, 6, 7], 'min_samples_leaf': [3, 5, 7, 9, 11], 'max_depth': [5, 10, 15]}

In [181]:
rf = RandomForestClassifier(random_state=31)

# Создание экземпляра GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=params, cv=3, verbose=3)
# Выполнение поиска по сетке
grid_search.fit(X_train, y_train)

# Получение наилучших параметров и оценки
best_params = grid_search.best_params_


# Вывод результатов
print("Наилучшие параметры:", best_params)


Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV 1/3] END max_depth=5, max_features=4, min_samples_leaf=3;, score=0.776 total time=   1.3s
[CV 2/3] END max_depth=5, max_features=4, min_samples_leaf=3;, score=0.775 total time=   1.2s
[CV 3/3] END max_depth=5, max_features=4, min_samples_leaf=3;, score=0.775 total time=   1.2s
[CV 1/3] END max_depth=5, max_features=4, min_samples_leaf=5;, score=0.776 total time=   1.2s
[CV 2/3] END max_depth=5, max_features=4, min_samples_leaf=5;, score=0.775 total time=   1.3s
[CV 3/3] END max_depth=5, max_features=4, min_samples_leaf=5;, score=0.775 total time=   1.3s
[CV 1/3] END max_depth=5, max_features=4, min_samples_leaf=7;, score=0.776 total time=   1.2s
[CV 2/3] END max_depth=5, max_features=4, min_samples_leaf=7;, score=0.775 total time=   1.2s
[CV 3/3] END max_depth=5, max_features=4, min_samples_leaf=7;, score=0.775 total time=   1.2s
[CV 1/3] END max_depth=5, max_features=4, min_samples_leaf=9;, score=0.776 total time=   1.2

In [182]:
rf = RandomForestClassifier(random_state=31, n_estimators=100, max_depth=15, max_features=7, min_samples_leaf=3)

rf.fit(X_train, y_train)
preds_train = rf.predict(X_train)
preds_test = rf.predict(X_test)
roc_auc_score(y_test, preds_test)

0.7005484843285417

In [183]:
feature_importance = rf.feature_importances_

In [184]:
feature_names = [x for x in cleaned_data if x != 'RainTomorrow']
pd.DataFrame({'feat': feature_names,
              'coef': rf.feature_importances_}).sort_values(by='coef', ascending=False)

Unnamed: 0,feat,coef
7,Humidity3pm,0.250783
2,Rainfall,0.079757
6,Humidity9am,0.070403
10,Cloud9am,0.067092
9,Pressure3pm,0.065272
...,...,...
50,Location_Newcastle,0.000000
62,Location_SalmonGums,0.000000
51,Location_Nhil,0.000000
52,Location_NorahHead,0.000000
