### Load Data

In [14]:
import pandas as pd

# Location of the raw water-potability CSV, read directly over HTTP.
DATA_URL = 'https://raw.githubusercontent.com/DinukaSanjana/Water-Quality-Monitoring/refs/heads/main/water_potability_csv.csv'

# Parse the CSV into a DataFrame and display it as the cell's output.
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,ph,Conductivity,Turbidity,Potability
0,3.716080,592.885359,4.500656,0
1,8.099124,418.606213,3.055934,0
2,8.316766,363.266516,4.628771,0
3,9.092223,398.410813,4.075075,0
4,5.584087,280.467916,2.559708,0
...,...,...,...,...
2780,4.668102,526.424171,4.435821,1
2781,7.808856,392.449580,2.798243,1
2782,9.419510,432.044783,3.298875,1
2783,5.126763,402.883113,4.708658,1


### Data Separation

In [15]:
# Target vector: every row of the 'Potability' column.
y = df.loc[:, 'Potability']
y

Unnamed: 0,Potability
0,0
1,0
2,0
3,0
4,0
...,...
2780,1
2781,1
2782,1
2783,1


In [16]:
# Feature matrix: every column except the 'Potability' target.
x = df.drop(columns='Potability')
x

Unnamed: 0,ph,Conductivity,Turbidity
0,3.716080,592.885359,4.500656
1,8.099124,418.606213,3.055934
2,8.316766,363.266516,4.628771
3,9.092223,398.410813,4.075075
4,5.584087,280.467916,2.559708
...,...,...,...
2780,4.668102,526.424171,4.435821
2781,7.808856,392.449580,2.798243
2782,9.419510,432.044783,3.298875
2783,5.126763,402.883113,4.708658


In [17]:
# NOTE(review): this repeats the earlier `y = df['Potability']` cell verbatim;
# it re-assigns y to the same column and displays it again.
y = df['Potability']
y

Unnamed: 0,Potability
0,0
1,0
2,0
3,0
4,0
...,...
2780,1
2781,1
2782,1
2783,1


In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)


In [19]:
# Display the held-out feature rows returned by train_test_split.
x_test


Unnamed: 0,ph,Conductivity,Turbidity
2699,6.641174,421.417352,3.154701
2166,5.870616,427.182531,3.728785
2767,6.683368,307.725009,5.208061
442,5.608745,596.076368,4.644212
2185,5.946161,499.937502,4.903632
...,...,...,...
785,5.700785,359.506553,3.978141
2039,5.916930,428.832746,3.157560
2089,5.097786,445.562644,4.829323
709,7.440825,452.995293,2.496343


## Model Building

### Linear Regression

In [20]:
from sklearn.linear_model import LinearRegression

# Least-squares linear model, fitted on the training split only.
# NOTE(review): the Potability values displayed above are 0/1, so this fits a
# regression to a binary label — confirm a regressor (not a classifier) is intended.
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

### Apply the model to make a prediction

In [21]:
# Score both splits with the fitted model: training rows first, then test rows.
y_lr_train_pred = lr.predict(X=x_train)
y_lr_test_pred = lr.predict(X=x_test)

In [22]:
# Print the model's predictions for the training rows, followed by the test rows.
print(y_lr_train_pred, y_lr_test_pred)

[0.38728287 0.36444043 0.37629183 ... 0.38319993 0.3733915  0.38511004] [0.38112021 0.37810882 0.38609047 0.35981642 0.3683589  0.37445126
 0.38400229 0.3814508  0.37459211 0.37969862 0.37725901 0.37451313
 0.39562312 0.39105654 0.37204336 0.36638689 0.39268822 0.37252007
 0.37345825 0.38468903 0.38998803 0.38823719 0.36765773 0.37883796
 0.39351564 0.39181597 0.36761359 0.37592481 0.38663895 0.37512819
 0.37444355 0.38290962 0.37428768 0.37854819 0.38432093 0.36749819
 0.38207725 0.36617908 0.38614762 0.37831744 0.380267   0.37100532
 0.37368718 0.37556253 0.38582288 0.36805423 0.38204593 0.38342508
 0.38768888 0.38762345 0.38328214 0.38445707 0.36843954 0.38437335
 0.38628306 0.39145069 0.39095694 0.38783012 0.38217572 0.38585585
 0.37656864 0.37421796 0.3743904  0.37835698 0.38926688 0.38121149
 0.38429308 0.36934427 0.3645845  0.37778668 0.3903834  0.38646598
 0.36789693 0.3835124  0.38530524 0.38838674 0.38183426 0.37387652
 0.37250306 0.38800726 0.3892453  0.37852145 0.37834537 0

In [23]:
# Display the test-split predictions on their own (shown as an array repr).
y_lr_test_pred

array([0.38112021, 0.37810882, 0.38609047, 0.35981642, 0.3683589 ,
       0.37445126, 0.38400229, 0.3814508 , 0.37459211, 0.37969862,
       0.37725901, 0.37451313, 0.39562312, 0.39105654, 0.37204336,
       0.36638689, 0.39268822, 0.37252007, 0.37345825, 0.38468903,
       0.38998803, 0.38823719, 0.36765773, 0.37883796, 0.39351564,
       0.39181597, 0.36761359, 0.37592481, 0.38663895, 0.37512819,
       0.37444355, 0.38290962, 0.37428768, 0.37854819, 0.38432093,
       0.36749819, 0.38207725, 0.36617908, 0.38614762, 0.37831744,
       0.380267  , 0.37100532, 0.37368718, 0.37556253, 0.38582288,
       0.36805423, 0.38204593, 0.38342508, 0.38768888, 0.38762345,
       0.38328214, 0.38445707, 0.36843954, 0.38437335, 0.38628306,
       0.39145069, 0.39095694, 0.38783012, 0.38217572, 0.38585585,
       0.37656864, 0.37421796, 0.3743904 , 0.37835698, 0.38926688,
       0.38121149, 0.38429308, 0.36934427, 0.3645845 , 0.37778668,
       0.3903834 , 0.38646598, 0.36789693, 0.3835124 , 0.38530

### Evaluate model performance

In [33]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error per split (note: the `_mea` names hold MSE values).
lr_train_mea = mean_squared_error(y_true=y_train, y_pred=y_lr_train_pred)
lr_test_mea = mean_squared_error(y_true=y_test, y_pred=y_lr_test_pred)

# Coefficient of determination (R^2) per split.
lr_train_r2 = r2_score(y_true=y_train, y_pred=y_lr_train_pred)
lr_test_r2 = r2_score(y_true=y_test, y_pred=y_lr_test_pred)

In [38]:
# Report each metric on its own line, training split first, then test split.
for label, value in (
    ('LR MSE (Train) : ', lr_train_mea),
    ('LR R2 (Train) : ', lr_train_r2),
    ('LR MSE (Test) : ', lr_test_mea),
    ('LR R2 (Test) : ', lr_test_r2),
):
    print(label, value)

LR MSE (Train) :  0.23525205378758154
LR R2 (Train) :  0.00026416410235607923
LR MSE (Test) :  0.2557692254971047
LR R2 (Test) :  -0.02921071908237649


In [40]:
# One summary row: model name, then train MSE, train R2, test MSE, test R2.
lr_row = ['Linear Regression', lr_train_mea, lr_train_r2, lr_test_mea, lr_test_r2]

# A flat list becomes a single column, so transpose it into a single row.
results = pd.DataFrame(lr_row).T

In [41]:
# Display the one-row summary table of the linear-regression metrics.
results

Unnamed: 0,0,1,2,3,4
0,Linear Regression,0.235252,0.000264,0.255769,-0.029211
