In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import seaborn as sns
from pathlib import Path

In [3]:
#import data
#dropna() due to missing data in warp and weft shrinkage
table_train=pd.read_excel("Shrinkage Data for HKU updated 5_4.xlsx").dropna()
drop_cols_train=[
            'gf_id\n品种ID',
            'gf_no\n品种号',
            'PPO_no\nPPO号',
            'fabric_no\n布号',
            'send_no\n送布单号',
            'test_date\n测试时间',
            'warp_shrinkage\n经向缩率',
            'weft_shrinkage\n纬向缩率'
            ]
table_train_X= table_train.drop(drop_cols_train,axis=1)
drop_cols_prod=['NO']
table_prod_X=pd.read_excel("Data+Test+of+shrinkage+for+HKU.xlsx").drop(drop_cols_prod,axis=1)
raw_data=pd.concat([table_train_X,table_prod_X], axis=0)
raw_data.head()

Unnamed: 0,wash_method\n洗水方法,Fabric_Type\n织法,Warp_Type\n经纱纱型,Weft_Type\n纬纱纱型,Warp_Count\n经纱支,Weft_Count\n纬纱支,Warp_Density\n经纱密度,Weft_Density\n纬纱密度,Finishing_Method\n整理方法,GEW STD warp\nGEW经向缩率审单标准,GEW STD weft\nGEW纬向缩率审单标准,Mercerizing\n丝光碱浓
0,GW,P,CF+JC,CF+JC,50+100/2,50,120,100,1+,2.0,2.0,24BE
1,GW,P,CF+JC,CF+JC,50+100/2,50,120,100,1+,2.0,2.0,24BE
2,GW,O,JC+CF,JC,40+80/2,20,110,54,1+,2.0,2.0,18BE
3,GW,C,JC+CF,JC+CF,50+100/2,50,150,126,1+,2.0,2.0,24BE
4,GW,C,JC,JC,50,50,150,126,1+,2.0,2.0,24BE


In [4]:
print('Number of row and columns in train+test set: ', table_train_X.shape)
print('Number of row and columns in production set: ', table_prod_X.shape)
print('Total Number of row and columns', raw_data.shape)

Number of row and columns in train+test set:  (113357, 12)
Number of row and columns in production set:  (200, 12)
Total Number of row and columns (113557, 12)


In [5]:
#clean the X
#fabric_type to int
Wash_Method_dummpy = pd.get_dummies(raw_data['wash_method\n洗水方法'],drop_first=False,prefix='wash_method')
Fabric_Type_dummpy = pd.get_dummies(raw_data['Fabric_Type\n织法'],drop_first=False,prefix='Fabric_Type')
Warp_Type_dummpy = pd.get_dummies(raw_data['Warp_Type\n经纱纱型'],drop_first=False,prefix='Warp_Type')
Weft_Type_dummpy = pd.get_dummies(raw_data['Weft_Type\n纬纱纱型'],drop_first=False,prefix='Weft_Type')
Finishing_Method_dummpy = pd.get_dummies(raw_data['Finishing_Method\n整理方法'],drop_first=False,prefix='Finishing_Method')
Mercerizing_dummpy = pd.get_dummies(raw_data['Mercerizing\n丝光碱浓'],drop_first=False,prefix='Mercerizing')
Warp_Count_dummpy = pd.get_dummies(raw_data['Warp_Count\n经纱支'],drop_first=False,prefix='Warp_Count')
Weft_Count_dummpy = pd.get_dummies(raw_data['Weft_Count\n纬纱支'],drop_first=False,prefix='Weft_Count')
dummpy_data = pd.concat([Wash_Method_dummpy, Fabric_Type_dummpy, Warp_Type_dummpy, Weft_Type_dummpy, Finishing_Method_dummpy, Mercerizing_dummpy, Mercerizing_dummpy, Warp_Count_dummpy], axis =1)
dummpy_data.head()

Unnamed: 0,wash_method_ABGA,wash_method_AGWA,wash_method_BGW,wash_method_BIP,wash_method_BSR,wash_method_CGK,wash_method_CHW,wash_method_COWA,wash_method_CSD,wash_method_CSW,...,Warp_Count_80+140/2+80\20D/2,Warp_Count_60+30/2,Warp_Count_60+40\40D/2,Warp_Count_50+80\20D/2,Warp_Count_50+100/2+80\20D/2,Warp_Count_80+80\20D/2,Warp_Count_40+40\40D/2,Warp_Count_40+50/2+80/2,Warp_Count_60+80\20D/80,Warp_Count_60+80\20D/80+120/2
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
drop_cols_convert=['wash_method\n洗水方法','Fabric_Type\n织法','Warp_Type\n经纱纱型','Weft_Type\n纬纱纱型','Finishing_Method\n整理方法','Warp_Count\n经纱支','Weft_Count\n纬纱支','Mercerizing\n丝光碱浓']
data = pd.concat([raw_data.drop(drop_cols_convert,axis=1),dummpy_data],axis =1)
data.head()

Unnamed: 0,Warp_Density\n经纱密度,Weft_Density\n纬纱密度,GEW STD warp\nGEW经向缩率审单标准,GEW STD weft\nGEW纬向缩率审单标准,wash_method_ABGA,wash_method_AGWA,wash_method_BGW,wash_method_BIP,wash_method_BSR,wash_method_CGK,...,Warp_Count_80+140/2+80\20D/2,Warp_Count_60+30/2,Warp_Count_60+40\40D/2,Warp_Count_50+80\20D/2,Warp_Count_50+100/2+80\20D/2,Warp_Count_80+80\20D/2,Warp_Count_40+40\40D/2,Warp_Count_40+50/2+80/2,Warp_Count_60+80\20D/80,Warp_Count_60+80\20D/80+120/2
0,120,100,2.0,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,120,100,2.0,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,110,54,2.0,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,150,126,2.0,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,150,126,2.0,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# clean the Y
warp_shrinkage = table_train['warp_shrinkage\n经向缩率'].astype(str).str.extract(r'(-?\d+.\d+)').astype('float')
warp_shrinkage.head()
weft_shrinkage = table_train['weft_shrinkage\n纬向缩率'].astype(str).str.extract(r'(-?\d+.?\d*)').astype('float')
weft_shrinkage.head()

Unnamed: 0,0
0,-0.6
1,-0.87
2,-1.37
3,-1.13
4,-0.6


In [10]:
#split the train dataset and test dataset
X = data.head(table_train_X.shape[0]).values
Y = pd.concat([warp_shrinkage,weft_shrinkage], axis=1).values

Y_warp= Y[:, 0]
Y_weft= Y[:, 1]
print('Number of row and columns in X set:', X.shape)
print('Number of row and columns in Y set for warp:', Y_warp.shape)
print('Number of row and columns in Y set for weft:', Y_weft.shape)

Number of row and columns in X set: (113357, 1175)
Number of row and columns in Y set for warp: (113357,)
Number of row and columns in Y set for weft: (113357,)


In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np

In [13]:
sc_X = StandardScaler()
X_train=sc_X.fit_transform(X) #scale the independent variables
y_train_warp=Y_warp # scaling is not required for dependent variable
y_train_weft=Y_weft


In [14]:
X_prod = data.tail(table_prod_X.shape[0]).values

In [15]:
X_test=sc_X.transform(X_prod)

In [None]:
reg_warp = LinearRegression().fit(X_train, y_train_warp)
print("The Linear regression score on warp training data is ", round(reg.score(X_train, y_train_warp),2))

In [19]:
reg_weft = LinearRegression().fit(X_train, y_train_weft)
print("The Linear regression score on weft training data is ", round(reg.score(X_train, y_train_weft),2))

The Linear regression score on weft training data is  0.46


In [20]:
predict_warp=reg_warp.predict(X_test)
predict_weft=reg_weft.predict(X_test)

In [21]:
Y_prod_pred_df = pd.DataFrame(list(zip(predict_warp, predict_weft)), columns=['warp_shrinkage\n经向缩率', 'weft_shrinkage\n纬向缩率'])
Y_prod_pred_df.to_excel("shrinkage_prediction_output_0629.xlsx", index = False, sheet_name='shrinkage prediction')