In [1]:
# Core data-analysis stack used by the cells below.
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the whole session, including
# library deprecation notices; consider narrowing the filter.
warnings.filterwarnings('ignore')
from scipy import stats
import seaborn as sns
# Show up to 100 columns and do not wrap wide frames across several lines.
pd.set_option('display.max_columns',100)
pd.set_option('expand_frame_repr',False)

In [2]:
# Load the HR data set and clean it in one chain:
#   1. drop rows that contain missing values
#   2. drop rows whose department is the erroneous category 'sale'
# The index is not reset, so surviving rows keep their original labels.
df=(
    pd.read_csv(r'D:\文件\学习\数据集\HR\data\HR.csv')
    .dropna()
    .loc[lambda frame:frame['department']!='sale']
)

In [3]:
# Preview the first rows of the cleaned HR frame.
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the department names.
# LabelEncoder operates on a 1-D array of labels, so the column is passed as a
# Series (df['department']) rather than as a one-column DataFrame, which
# scikit-learn would have to flatten and warn about.
department_list=LabelEncoder()
dep_list=department_list.fit_transform(df['department'])

In [5]:
# Store the integer department codes next to the original text column.
df['department_encoder']=dep_list

In [6]:
# Build a lookup from integer code to department name; the code of a label is
# its position in the fitted encoder's classes_ array.
# NOTE: this rebinds dep_list from the encoded array to a dict.
dep_list=dict(enumerate(department_list.classes_))
dep_list

{0: 'IT',
 1: 'RandD',
 2: 'accounting',
 3: 'hr',
 4: 'management',
 5: 'marketing',
 6: 'product_mng',
 7: 'sales',
 8: 'support',
 9: 'technical'}

In [7]:
# One-hot encode the department column into dep_<name> indicator columns.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return True/False booleans).
department_dummy=pd.get_dummies(df['department'],prefix='dep',dtype=int)

In [8]:
# Display the one-hot indicator frame (one dep_* column per department).
department_dummy

Unnamed: 0,dep_IT,dep_RandD,dep_accounting,dep_hr,dep_management,dep_marketing,dep_product_mng,dep_sales,dep_support,dep_technical
0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
14994,0,0,0,0,0,0,0,0,1,0
14995,0,0,0,0,0,0,0,0,1,0
14996,0,0,0,0,0,0,0,0,1,0
14997,0,0,0,0,0,0,0,0,1,0


In [9]:
# Append the dep_* indicator columns to the HR frame, side by side.
# NOTE: re-running this cell appends the same columns again.
df=pd.concat((df,department_dummy),axis='columns')
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary,department_encoder,dep_IT,dep_RandD,dep_accounting,dep_hr,dep_management,dep_marketing,dep_product_mng,dep_sales,dep_support,dep_technical
0,0.38,0.53,2,157,3,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,sales,medium,7,0,0,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,1,0,sales,medium,7,0,0,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0


In [10]:
# Mean of average_monthly_hours across all remaining rows.
df['average_monthly_hours'].mean()

201.0503366891126

In [11]:
from sklearn.preprocessing import Binarizer
# Turn average_monthly_hours into a 0/1 flag: values above the threshold
# become 1, the rest 0.
# The threshold 201 is presumably the column mean (about 201.05) shown by the
# previous cell, rounded down; confirm before reusing on other data.
bin_=Binarizer(threshold=201)
# Binarizer needs 2-D input, hence the double brackets (one-column DataFrame).
bing=bin_.fit_transform(df[['average_monthly_hours']])

In [12]:
# Inspect the binarized result (a column array of 0/1 flags).
bing

array([[0],
       [1],
       [1],
       ...,
       [0],
       [1],
       [0]], dtype=int64)

In [13]:
# Attach the 0/1 working-hours flag to the frame as a new column.
df['hour_bin']=bing

In [14]:
# Preview the frame with the new hour_bin column.
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary,department_encoder,dep_IT,dep_RandD,dep_accounting,dep_hr,dep_management,dep_marketing,dep_product_mng,dep_sales,dep_support,dep_technical,hour_bin
0,0.38,0.53,2,157,3,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0,0
1,0.8,0.86,5,262,6,0,1,0,sales,medium,7,0,0,0,0,0,0,0,1,0,0,1
2,0.11,0.88,7,272,4,0,1,0,sales,medium,7,0,0,0,0,0,0,0,1,0,0,1
3,0.72,0.87,5,223,5,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0,1
4,0.37,0.52,2,159,3,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0,0


In [15]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial / interaction features of number_project and
# average_monthly_hours.
# The two inputs are selected by name (they are the columns the positional
# slice iloc[:,2:4] picked) so the cell does not depend on column order.
df_col=df[['number_project','average_monthly_hours']]
num_and_hours=PolynomialFeatures(degree=2,include_bias=False)
res=num_and_hours.fit_transform(df_col)
# Reuse df's row labels. df had rows dropped without a reset_index, so a frame
# with a fresh 0..n-1 index would be misaligned (and produce NaN rows) when it
# is later joined to df with pd.concat(axis=1), which aligns on index labels.
res=pd.DataFrame(res,index=df_col.index)
res

Unnamed: 0,0,1,2,3,4
0,2.0,157.0,4.0,314.0,24649.0
1,5.0,262.0,25.0,1310.0,68644.0
2,7.0,272.0,49.0,1904.0,73984.0
3,5.0,223.0,25.0,1115.0,49729.0
4,2.0,159.0,4.0,318.0,25281.0
...,...,...,...,...,...
14994,2.0,151.0,4.0,302.0,22801.0
14995,2.0,160.0,4.0,320.0,25600.0
14996,2.0,143.0,4.0,286.0,20449.0
14997,6.0,280.0,36.0,1680.0,78400.0


In [16]:
# Name the five polynomial outputs. For inputs a=number_project and
# b=average_monthly_hours the columns are, in order: a, b, a^2, a*b, b^2
# (e.g. the first row 2, 157, 4, 314, 24649).
res.columns=['num_01','num_02','num_03','num_04','num_05']

In [17]:
# Append the polynomial feature columns to the HR frame.
# pd.concat along columns aligns rows on index labels, so res must carry the
# same index as df for the features to land on the right rows.
df=pd.concat((df,res),axis='columns')
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary,department_encoder,dep_IT,dep_RandD,dep_accounting,dep_hr,dep_management,dep_marketing,dep_product_mng,dep_sales,dep_support,dep_technical,hour_bin,num_01,num_02,num_03,num_04,num_05
0,0.38,0.53,2,157,3,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0,0,2.0,157.0,4.0,314.0,24649.0
1,0.8,0.86,5,262,6,0,1,0,sales,medium,7,0,0,0,0,0,0,0,1,0,0,1,5.0,262.0,25.0,1310.0,68644.0
2,0.11,0.88,7,272,4,0,1,0,sales,medium,7,0,0,0,0,0,0,0,1,0,0,1,7.0,272.0,49.0,1904.0,73984.0
3,0.72,0.87,5,223,5,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0,1,5.0,223.0,25.0,1115.0,49729.0
4,0.37,0.52,2,159,3,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0,0,2.0,159.0,4.0,318.0,25281.0


In [18]:
# Same preview as the previous cell; df was not modified in between.
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary,department_encoder,dep_IT,dep_RandD,dep_accounting,dep_hr,dep_management,dep_marketing,dep_product_mng,dep_sales,dep_support,dep_technical,hour_bin,num_01,num_02,num_03,num_04,num_05
0,0.38,0.53,2,157,3,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0,0,2.0,157.0,4.0,314.0,24649.0
1,0.8,0.86,5,262,6,0,1,0,sales,medium,7,0,0,0,0,0,0,0,1,0,0,1,5.0,262.0,25.0,1310.0,68644.0
2,0.11,0.88,7,272,4,0,1,0,sales,medium,7,0,0,0,0,0,0,0,1,0,0,1,7.0,272.0,49.0,1904.0,73984.0
3,0.72,0.87,5,223,5,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0,1,5.0,223.0,25.0,1115.0,49729.0
4,0.37,0.52,2,159,3,0,1,0,sales,low,7,0,0,0,0,0,0,0,1,0,0,0,2.0,159.0,4.0,318.0,25281.0


In [19]:
# Load the cleaned used-car training set. index_col=False tells pandas not to
# use the first column of the file as the index.
# NOTE(review): absolute local path; the notebook only runs on the author's machine.
train_data=pd.read_csv(r"D:\文件\学习\数据集\二手车交易\trainDataAfterCleaning.csv",index_col=False)

In [20]:
from datetime import datetime

In [21]:
# Preview the first rows of the used-car data.
train_data.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14,price
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,0.0,1046,0,0,20160404,43.357796,3.966344,0.050257,2.159744,1.143786,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762,1850
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,0.0,4366,0,0,20160309,45.305273,5.236112,0.137925,1.380657,-1.422165,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522,3600
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,0.0,2806,0,0,20160402,45.978359,4.823792,1.319524,-0.998467,-0.996911,0.25141,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963,6222
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,0,0,20160312,45.687478,4.492574,-0.050616,0.8836,-2.228079,0.274293,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699,2400
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,0.0,6977,0,0,20160313,44.383511,2.031433,0.572169,-1.571239,2.246088,0.228036,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482,5200


In [22]:
def _reg_date_part(start,stop,column):
    """Return a one-column frame holding characters [start:stop] of each regDate value."""
    pieces=[str(value)[start:stop] for value in train_data['regDate']]
    return pd.DataFrame(pieces,columns=[column])

# regDate is stored as a number such as 20040402 (YYYYMMDD); cut its text form
# into year, month and day strings.
year=_reg_date_part(0,4,'year')
month=_reg_date_part(4,6,'month')
day=_reg_date_part(6,None,'day')

In [23]:
# '00' is not a valid calendar month, so those entries are rewritten to '03'.
# NOTE(review): why March was chosen is not shown here (possibly the most
# common month); confirm.
month['month']=month['month'].replace('00','03')

In [24]:
# Append the year / month / day frames as new columns of train_data.
train_data=pd.concat([train_data,year,month,day],axis=1)

In [25]:
# The date parts were sliced out as strings; cast them to integers so they
# can be passed to datetime().
train_data=train_data.astype({'year':'int','month':'int','day':'int'})

In [26]:
# Build one datetime per row from the integer year / month / day columns.
# (Despite its name, `week` holds full datetimes at this point.)
week=list(map(datetime,train_data['year'],train_data['month'],train_data['day']))

In [27]:
# Wrap the datetimes in a one-column frame named 'datetime' and append it to
# train_data as a new column.
week=pd.DataFrame({'datetime':week})
train_data=pd.concat((train_data,week),axis='columns')

In [28]:
# Week number of the year for each registration date (Timestamp.week).
train_data['week']=train_data['datetime'].map(lambda stamp:stamp.week)

In [29]:
# Quarter of the year (1-4) for each registration date. The vectorized .dt
# accessor yields the same values as calling .quarter on every Timestamp
# through apply, without a Python-level loop.
train_data['quarter']=train_data['datetime'].dt.quarter

In [30]:
# Ordinal day of the year (1-366) for each registration date. The vectorized
# .dt accessor yields the same values as calling .dayofyear on every
# Timestamp through apply, without a Python-level loop.
train_data['dayofyear']=train_data['datetime'].dt.dayofyear

In [31]:
# Preview the frame with the derived date columns.
train_data.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14,price,year,month,day,datetime,week,quarter,dayofyear
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,0.0,1046,0,0,20160404,43.357796,3.966344,0.050257,2.159744,1.143786,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762,1850,2004,4,2,2004-04-02,14,2,93
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,0.0,4366,0,0,20160309,45.305273,5.236112,0.137925,1.380657,-1.422165,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522,3600,2003,3,1,2003-03-01,9,1,60
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,0.0,2806,0,0,20160402,45.978359,4.823792,1.319524,-0.998467,-0.996911,0.25141,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963,6222,2004,4,3,2004-04-03,14,2,94
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,0,0,20160312,45.687478,4.492574,-0.050616,0.8836,-2.228079,0.274293,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699,2400,1996,9,8,1996-09-08,36,3,252
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,0.0,6977,0,0,20160313,44.383511,2.031433,0.572169,-1.571239,2.246088,0.228036,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482,5200,2012,1,3,2012-01-03,1,1,3


In [32]:
# Toy corpus of six short sentences for the text-feature example.
corpus=[
    'The sky is blue and beautiful',
    'Love this blue and beautiful sky',
    'The quick brown fox jumps over the lazy dog',
    'The brown fox is quick and the blue dog is lazy',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is lazy but the brown fox is quick'
]
# One topic label per sentence, in the same order as corpus.
labels=['weather','weather','animals','animals','weather','animals']

In [33]:
# Put the sentences into a one-column frame ('text').
# NOTE: this rebinds corpus from a list to a DataFrame.
corpus=pd.DataFrame(corpus,columns=['text'])

In [34]:
# Add the topic labels as a second column.
corpus['label']=labels

In [35]:
# Display the labelled corpus.
corpus

Unnamed: 0,text,label
0,The sky is blue and beautiful,weather
1,Love this blue and beautiful sky,weather
2,The quick brown fox jumps over the lazy dog,animals
3,The brown fox is quick and the blue dog is lazy,animals
4,The sky is very blue and the sky is very beaut...,weather
5,The dog is lazy but the brown fox is quick,animals


In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [41]:
# Build unigram TF-IDF features for the corpus, keeping at most 100 terms and
# applying no document-frequency cut-offs (min_df=0., max_df=1.).
# stop_words='english' selects scikit-learn's built-in English list (the same
# set as ENGLISH_STOP_WORDS). The parameter is documented to take 'english',
# a list or None; a frozenset is rejected by newer scikit-learn releases.
tf=TfidfVectorizer(stop_words='english',ngram_range=(1,1),min_df=0.,max_df=1.,max_features=100)
corpus_list=tf.fit_transform(corpus['text'])

In [42]:
# Show the summary of the TF-IDF result, which is still a SciPy sparse matrix here.
corpus_list

<6x11 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [43]:
# Convert the sparse matrix to a dense NumPy array.
# NOTE: this rebinds corpus_list, so the cell cannot be re-run on its own result.
corpus_list=corpus_list.toarray()

In [44]:
# Display the dense TF-IDF array (one row per sentence, one column per term).
corpus_list

array([[0.60474937, 0.51822427, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.60474937,
        0.        ],
       [0.45545397, 0.39028945, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.65787347, 0.        , 0.45545397,
        0.        ],
       [0.        , 0.        , 0.3756535 , 0.3756535 , 0.3756535 ,
        0.5426069 , 0.3756535 , 0.        , 0.3756535 , 0.        ,
        0.        ],
       [0.        , 0.35785031, 0.41759865, 0.41759865, 0.41759865,
        0.        , 0.41759865, 0.        , 0.41759865, 0.        ,
        0.        ],
       [0.35758304, 0.30642149, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.71516607,
        0.5165053 ],
       [0.        , 0.        , 0.4472136 , 0.4472136 , 0.4472136 ,
        0.        , 0.4472136 , 0.        , 0.4472136 , 0.        ,
        0.        ]])

In [45]:
# Vocabulary terms in column order of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back to the old method
# on older installations. list(...) keeps col_name a plain Python list.
if hasattr(tf,'get_feature_names_out'):
    col_name=list(tf.get_feature_names_out())
else:
    col_name=tf.get_feature_names()

In [46]:
# Display the vocabulary terms.
col_name

['beautiful',
 'blue',
 'brown',
 'dog',
 'fox',
 'jumps',
 'lazy',
 'love',
 'quick',
 'sky',
 'today']

In [47]:
# Wrap the dense TF-IDF array in a DataFrame with one column per vocabulary term.
corpus_list=pd.DataFrame(corpus_list,columns=col_name)

In [48]:
# Display the labelled TF-IDF table.
corpus_list

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,0.604749,0.518224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.604749,0.0
1,0.455454,0.390289,0.0,0.0,0.0,0.0,0.0,0.657873,0.0,0.455454,0.0
2,0.0,0.0,0.375653,0.375653,0.375653,0.542607,0.375653,0.0,0.375653,0.0,0.0
3,0.0,0.35785,0.417599,0.417599,0.417599,0.0,0.417599,0.0,0.417599,0.0,0.0
4,0.357583,0.306421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.715166,0.516505
5,0.0,0.0,0.447214,0.447214,0.447214,0.0,0.447214,0.0,0.447214,0.0,0.0
