In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

In [3]:
df = pd.read_csv("train_data.csv")

In [4]:
df.drop(columns=["ID", "Unnamed: 0"],inplace=True)

In [5]:
df.describe()

Unnamed: 0,Age,Vintage,Avg_Account_Balance,y
count,221152.0,221152.0,221152.0,221152.0
mean,43.865002,46.986064,1129240.0,0.237321
std,14.825477,32.361516,853856.8,0.425442
min,23.0,7.0,20790.0,0.0
25%,30.0,20.0,604621.0,0.0
50%,43.0,32.0,895415.5,0.0
75%,54.0,74.0,1367406.0,0.0
max,85.0,135.0,10352010.0,1.0


In [6]:
df.isnull().sum()

Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         26474
Avg_Account_Balance        0
Is_Active                  0
y                          0
dtype: int64

In [7]:
le = LabelEncoder()

In [8]:
df["Gender"] = le.fit_transform(df["Gender"])
df["Region_Code"] = le.fit_transform(df["Region_Code"])
df["Is_Active"] = le.fit_transform(df["Is_Active"])
df["Occupation"] = le.fit_transform(df["Occupation"])
df['Channel_Code'] = le.fit_transform(df['Channel_Code'])

In [9]:
df['Credit_Product'] = LabelEncoder().fit_transform(df['Credit_Product'])

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221152 entries, 0 to 221151
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   Gender               221152 non-null  int32
 1   Age                  221152 non-null  int64
 2   Region_Code          221152 non-null  int32
 3   Occupation           221152 non-null  int32
 4   Channel_Code         221152 non-null  int32
 5   Vintage              221152 non-null  int64
 6   Credit_Product       221152 non-null  int32
 7   Avg_Account_Balance  221152 non-null  int64
 8   Is_Active            221152 non-null  int32
 9   y                    221152 non-null  int64
dtypes: int32(6), int64(4)
memory usage: 11.8 MB


In [11]:
df.head()

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,y
0,1,58,10,3,2,39,1,405065,0,0
1,1,28,31,2,0,32,0,560477,1,0
2,0,31,27,3,0,26,0,530190,0,0
3,1,51,34,3,0,50,1,457495,0,0
4,0,26,1,2,0,14,0,350526,0,0


In [12]:
def handle_outliers(df, columns, threshold=3):
    for col in columns:
        # Вычисляем среднее и стандартное отклонение
        mean = df[col].mean()
        std = df[col].std()
        
        # Определяем границы выбросов
        upper_limit = mean + threshold * std
        lower_limit = mean - threshold * std
        
        # Заменяем выбросы на NaN
        df[col] = np.where((df[col] > upper_limit) | (df[col] < lower_limit), np.nan, df[col])
        
        # Можно заменить NaN на медиану
        median = df[col].median()
        df[col].fillna(median, inplace=True)
        
    return df

# Список числовых столбцов для обработки выбросов
numeric_columns = ['Age', 'Vintage', 'Avg_Account_Balance']

# Обработка выбросов
df = handle_outliers(df, numeric_columns)

In [13]:
df["Credit_Product"].value_counts()

Credit_Product
0    129759
1     64919
2     26474
Name: count, dtype: int64

In [14]:
from sklearn.pipeline import Pipeline


In [15]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [16]:
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20],
    'classifier__min_samples_split': [2, 5]
}

In [17]:
X = df.copy()
y = X.pop("y")

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [19]:
scaler = StandardScaler()

# Обучение StandardScaler на тренировочных данных и преобразование тренировочных данных
X_train_scaled = scaler.fit_transform(X_train)

# Преобразование тестовых данных
X_test_scaled = scaler.transform(X_test)

In [20]:
gbc = GradientBoostingClassifier(n_estimators=300)

In [21]:
gbc.fit(X,y)

In [22]:
gbc.score(X_train,y_train)

0.8621644688872434

In [23]:
gbc.score(X_test, y_test)

0.8601433383825824

In [32]:
rfc = RandomForestClassifier(max_depth=15,random_state=42)
rfc.fit(X_train,y_train)
rfc.score(X_train,y_train)

0.8878030307312303

In [33]:
rfc.score(X_test,y_test)

0.8565485745291763

In [34]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn.score(X_train,y_train)

0.8017928906121942

In [35]:
rfc.fit(X,y)

In [36]:
knn.score(X_test,y_test)

0.7303248852614682

In [37]:
df_test = pd.read_csv("test_data.csv")

In [38]:
df_test.drop(columns=["ID", "Unnamed: 0"],inplace=True)

In [39]:
df_test["Gender"] = le.fit_transform(df_test["Gender"])
df_test["Region_Code"] = le.fit_transform(df_test["Region_Code"])
df_test["Is_Active"] = le.fit_transform(df_test["Is_Active"])
df_test["Occupation"] = le.fit_transform(df_test["Occupation"])
df_test['Channel_Code'] = le.fit_transform(df_test['Channel_Code'])
df_test['Credit_Product'] = LabelEncoder().fit_transform(df_test['Credit_Product'])

In [40]:
pd.DataFrame(rfc.predict(df_test)).to_csv("Zhunusov_a.csv")

In [41]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(rfc, X, y, cv=5, scoring='accuracy')

# Средняя точность и стандартное отклонение
mean_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()