In [22]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,ElasticNet,Lasso
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [3]:
# Load the raw dataset from the working directory and print its schema.
# The info() listing shows that most column names carry a leading space
# (' Plocs', ' TLR', ...) while 'tool' does not, so any name-based lookup
# on those columns has to include that space.
raw_path = 'all.csv'
df = pd.read_csv(raw_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17629 entries, 0 to 17628
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0    Plocs      17629 non-null  int64  
 1    MPlocs     17629 non-null  int64  
 2    Tlocs      17629 non-null  int64  
 3    MTlocs     17629 non-null  int64  
 4    Classes    17629 non-null  int64  
 5    Aclasses   17629 non-null  int64  
 6    DClasses   17629 non-null  int64  
 7    MClasses   17629 non-null  int64  
 8    Methods    17629 non-null  int64  
 9    AMethods   17629 non-null  int64  
 10   DMethods   17629 non-null  int64  
 11   MMethods   17629 non-null  int64  
 12   CMMethods  17629 non-null  int64  
 13  tool        17629 non-null  object 
 14   TLR        17629 non-null  float64
 15   MTRL       17629 non-null  float64
 16   MRTL       17629 non-null  float64
 17   TMR        17629 non-null  float64
 18   MCR        17629 non-null  float64
 19   MMR        17629 non-nul

In [4]:
# Per-column count of missing values (isnull() is the pandas alias of isna()).
# NOTE(review): the feature/target previews contain -1 entries that look like
# placeholder values; those are not NaN and are not counted here - confirm
# whether they should be treated as missing.
df.isnull().sum()

 Plocs        0
 MPlocs       0
 Tlocs        0
 MTlocs       0
 Classes      0
 Aclasses     0
 DClasses     0
 MClasses     0
 Methods      0
 AMethods     0
 DMethods     0
 MMethods     0
 CMMethods    0
tool          0
 TLR          0
 MTRL         0
 MRTL         0
 TMR          0
 MCR          0
 MMR          0
 RFCR         0
 FCR          0
dtype: int64

In [5]:
# One-hot encode the categorical 'tool' column: one indicator column per
# distinct tool name found in the data.
tool_column = df['tool']
tool = pd.get_dummies(tool_column)
tool.head()

Unnamed: 0,espresso,robolectric,robotium,uiautomator
0,False,False,False,True
1,False,False,False,True
2,False,False,False,True
3,False,False,False,True
4,False,False,False,True


In [6]:
# Turn the indicator columns into integer 0/1 values.
tool = tool.astype(dtype=int)
tool.head()

Unnamed: 0,espresso,robolectric,robotium,uiautomator
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [7]:
# Feature matrix: the first 13 columns of df (positions 0-12, selected by
# position) joined on the row index with the one-hot tool indicators.
code_metrics = df.iloc[:, :13]
X = code_metrics.join(tool)
X.head()

Unnamed: 0,Plocs,MPlocs,Tlocs,MTlocs,Classes,Aclasses,DClasses,MClasses,Methods,AMethods,DMethods,MMethods,CMMethods,espresso,robolectric,robotium,uiautomator
0,1897,-1,471,-1,1,-1,-1,-1,17,-1,-1,-1,-1,0,0,0,1
1,1895,4,471,0,1,0,0,0,17,0,0,0,0,0,0,0,1
2,1899,58,471,6,1,0,0,1,17,0,0,3,1,0,0,0,1
3,1938,81,471,0,1,0,0,0,17,0,0,0,0,0,0,0,1
4,1938,0,471,0,1,0,0,0,17,0,0,0,0,0,0,0,1


In [8]:
# Cast every feature column to float64 in one vectorised call instead of
# looping over the columns and assigning them back one at a time. The result
# is the same all-float64 frame, without leaving a stray loop variable in the
# notebook namespace.
X = X.astype('float64')

In [9]:
# Print the feature matrix schema to confirm the dtype cast applied to every column.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17629 entries, 0 to 17628
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0    Plocs       17629 non-null  float64
 1    MPlocs      17629 non-null  float64
 2    Tlocs       17629 non-null  float64
 3    MTlocs      17629 non-null  float64
 4    Classes     17629 non-null  float64
 5    Aclasses    17629 non-null  float64
 6    DClasses    17629 non-null  float64
 7    MClasses    17629 non-null  float64
 8    Methods     17629 non-null  float64
 9    AMethods    17629 non-null  float64
 10   DMethods    17629 non-null  float64
 11   MMethods    17629 non-null  float64
 12   CMMethods   17629 non-null  float64
 13  espresso     17629 non-null  float64
 14  robolectric  17629 non-null  float64
 15  robotium     17629 non-null  float64
 16  uiautomator  17629 non-null  float64
dtypes: float64(17)
memory usage: 2.3 MB


In [10]:
# Target matrix: columns at positions 14-21 of df (TLR ... FCR in the preview
# below). Position 13 is the categorical 'tool' column, which is already
# represented in X through its one-hot indicators, so it is skipped here.
target_start, target_stop = 14, 22
Y = df.iloc[:, target_start:target_stop]
Y.head()

Unnamed: 0,TLR,MTRL,MRTL,TMR,MCR,MMR,RFCR,FCR
0,0.248287,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,0.248549,0.0,0.0,0.0,0.0,0.0,-1.0,0.0
2,0.248025,0.012739,0.103448,0.416209,1.0,0.176471,1.0,1.0
3,0.243034,0.0,0.0,0.0,0.0,0.0,-1.0,0.0
4,0.243034,0.0,-1.0,-1.0,0.0,0.0,-1.0,0.0


In [11]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

In [12]:
# Depth-limited decision tree fitted on all target columns at once
# (y_train is a multi-column frame, so this is a multi-output regression).
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a seed, equally good splits can be resolved
# differently from run to run and the reported metrics drift. 42 is the same
# seed value the train/test split uses.
dt = DecisionTreeRegressor(max_depth=8, random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

In [13]:
# Score the held-out predictions. y_test has several target columns; with
# their default settings these sklearn metrics average the per-target scores
# uniformly into a single number.
mae, mse, r2 = (
    score(y_test, y_pred)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)
print(
    f'The mean_absolute_error {mae}',
    f'The mean_squared_error {mse}',
    f'The r2_score {r2}',
    sep='\n',
)

The mean_absolute_error 0.053307516998531784
The mean_squared_error 0.5144271983601069
The r2_score 0.8366616567502807


In [14]:
# Random forest of depth-limited trees, fitted on all target columns at once.
# random_state is pinned because the forest draws bootstrap samples and
# feature subsets at random; without a seed every run trains a different
# ensemble and the metrics printed below cannot be reproduced. 42 is the same
# seed value the train/test split uses.
dt = RandomForestRegressor(max_depth=8, random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Held-out error metrics (averaged uniformly over the target columns by default).
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'The mean_absolute_error {mae}')
print(f'The mean_squared_error {mse}')
print(f'The r2_score {r2}')

The mean_absolute_error 0.04902587072660272
The mean_squared_error 0.3956769189326229
The r2_score 0.8729415500121698


In [15]:
# Ordinary least-squares baseline on the same split; fit() returns the fitted
# estimator, so construction and training are chained.
dt = LinearRegression().fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Held-out error metrics for the linear baseline.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(
    f'The mean_absolute_error {mae}',
    f'The mean_squared_error {mse}',
    f'The r2_score {r2}',
    sep='\n',
)

The mean_absolute_error 0.425027917759365
The mean_squared_error 0.8813956124579717
The r2_score 0.19209016230684403


In [16]:
# Elastic-net regression with the library's default hyper-parameters,
# trained and evaluated on the same split as the other models.
dt = ElasticNet()
y_pred = dt.fit(X_train, y_train).predict(X_test)

# Compute the three held-out metrics in one pass, in the original order.
mae, mse, r2 = (
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)
print(f'The mean_absolute_error {mae}')
print(f'The mean_squared_error {mse}')
print(f'The r2_score {r2}')

The mean_absolute_error 0.4469299028034617
The mean_squared_error 0.9099146721673808
The r2_score 0.12186019916752434


In [17]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters. The fitted
# estimator stays bound to `dt`, which the following cell saves to disk.
dt = XGBRegressor()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Held-out error metrics for the boosted model.
mae, mse, r2 = (
    score(y_test, y_pred)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)
print(f'The mean_absolute_error {mae}')
print(f'The mean_squared_error {mse}')
print(f'The r2_score {r2}')


The mean_absolute_error 0.023994790821656488
The mean_squared_error 0.21206177766864898
The r2_score 0.9575178602971579


In [18]:
# Persist the fitted model to JSON in the working directory.
# `dt` is rebound by every model cell in this notebook; when cells run top to
# bottom it is the XGBRegressor fitted in the cell directly above. If this cell
# is run after the Lasso cell further down, `dt` would no longer be the XGBoost model.
dt.save_model("xgbreg.json")

In [19]:
# Export the full feature matrix (all rows, not only the training split) to CSV.
# With pandas defaults the row index is written as the first, unnamed column.
X.to_csv('X.csv')

In [21]:
# Lasso regression with the library's default hyper-parameters. Note that this
# rebinds `dt`, replacing the XGBoost model that was held there.
dt = Lasso().fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Held-out error metrics for the Lasso model.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(
    f'The mean_absolute_error {mae}',
    f'The mean_squared_error {mse}',
    f'The r2_score {r2}',
    sep='\n',
)

The mean_absolute_error 0.4470604829743266
The mean_squared_error 0.9106863377028751
The r2_score 0.11937058448764909
