## 肿瘤预测案例

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
# 1. Load the data
# Column names are supplied explicitly through `names`.
names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=names,
)
data.head()

In [3]:
# Summarize column dtypes and non-null counts of the freshly loaded frame.
data.info()

In [4]:
# List the distinct values present in the 'Bare Nuclei' column.
data['Bare Nuclei'].unique()

In [5]:
# Data cleaning: reload the file, this time parsing '?' entries as missing (NaN).
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=names,
    na_values='?',
)

In [6]:
# Re-check dtypes and non-null counts after reloading with '?' parsed as NaN.
data.info()

In [7]:
# Drop every row that contains a missing value, modifying `data` in place.
data.dropna(inplace=True)

In [8]:
# Inspect the frame again after dropping the rows with missing values.
data.info()

In [13]:
# Select the target ('Class') and the features (columns at positions 1-9).
y = data['Class']
x = data.iloc[:, 1:10]

# Split into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# Feature engineering: standardize. The scaler is fitted on the training split
# only, and the same fitted scaler is then applied to both splits.
transfer = StandardScaler().fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

# Create and train the logistic regression model (fit() returns the estimator).
estimator = LogisticRegression().fit(x_train, y_train)

# Evaluate the model on the held-out test split.
estimator.score(x_test, y_test)

## 电信流失用户预测

In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the churn dataset from the local CSV file.
churn = pd.read_csv('data/churn.csv')
# Show column dtypes and non-null counts.
churn.info()

In [16]:
# Preview the first rows of the churn data.
churn.head()

In [18]:
# Replace the categorical columns with indicator (dummy) columns.
churn = pd.get_dummies(churn)

In [19]:
# Drop the 'Churn_No' and 'gender_Male' columns in place, then preview the result.
churn.drop(columns=['Churn_No', 'gender_Male'], inplace=True)
churn.head()

In [21]:
# Rename the churn indicator to 'flag' -- the name every later cell relies on
# (churn.flag, groupby('flag'), hue='flag', churn['flag']).
churn = churn.rename(columns={'Churn_Yes': 'flag'})
churn.columns

In [23]:
# Class balance: relative frequency of each 'flag' value.
churn['flag'].value_counts(normalize=True)

In [24]:
# Mean of every column within each 'flag' group.
churn.groupby('flag').mean()

In [25]:
# Count plot of 'Contract_Month' (on the y axis), with bars split by 'flag'.
sns.countplot(y='Contract_Month',hue='flag',data=churn)

In [26]:
# Model inputs: three selected columns as features, 'flag' as the target.
x = churn[[
    'Contract_Month',
    'internet_other',
    'PaymentElectronic',
]]
y = churn['flag']

In [37]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=100
)

# Train a logistic regression model with default settings
# (fit() returns the fitted estimator).
lr = LogisticRegression().fit(x_train, y_train)

# Evaluate: accuracy of the predicted labels on the test set.
y_pred = lr.predict(x_test)
accuracy_score(y_test, y_pred)
# Next: parameter tuning

In [39]:
# ROC AUC is a ranking metric and expects continuous scores, not thresholded
# labels: score each test row with the predicted probability of the positive
# class (second column of predict_proba) so the whole ROC curve is evaluated.
roc_auc_score(y_test, lr.predict_proba(x_test)[:, 1])

In [51]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Refit with class_weight='balanced' (fit() returns the fitted estimator).
lr = LogisticRegression(class_weight='balanced').fit(x_train, y_train)

# Evaluate: accuracy of the predicted labels on the test set.
y_pred = lr.predict(x_test)
accuracy_score(y_test, y_pred)

In [52]:
# Inspect the fitted coefficients of the model.
lr.coef_

In [41]:
# ROC AUC is a ranking metric and expects continuous scores, not thresholded
# labels: score each test row with the predicted probability of the positive
# class (second column of predict_proba) so the whole ROC curve is evaluated.
roc_auc_score(y_test, lr.predict_proba(x_test)[:, 1])

In [42]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# 5-fold stratified cross-validation. The shuffle is seeded so the folds -- and
# therefore the selected hyper-parameters -- are the same on every run.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
lr = LogisticRegression()
# Candidate settings: three solvers crossed with five regularization strengths,
# always with balanced class weights.
param_grid = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'C': [0.001, 0.01, 1, 10, 100],
    'class_weight': ['balanced'],
}
search = GridSearchCV(lr, param_grid, cv=kfold)
# fit() returns the fitted GridSearchCV itself, so from here on `lr` refers to
# the search object (the next cells read best_estimator_/best_params_ from it).
lr = search.fit(x_train, y_train)

In [43]:
# Estimator refit with the best parameter combination found by the search.
lr.best_estimator_

In [44]:
# Parameter combination that achieved the best cross-validated score.
lr.best_params_

In [45]:
# Score of the fitted search object on the held-out test set.
lr.score(x_test,y_test)