In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Read the match records from the CSV file (path is relative to the
# notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='Matches.csv')

# Preview the top five rows; as the cell's last expression this is rendered
# as a rich table.
data.head(n=5)


Unnamed: 0,id,city,date,player_of_match,venue,neutral_venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,eliminator,method,umpire1,umpire2
0,335982,Bangalore,2008-04-18,BB McCullum,M Chinnaswamy Stadium,0,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,N,,Asad Rauf,RE Koertzen
1,335983,Chandigarh,2008-04-19,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",0,Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,N,,MR Benson,SL Shastri
2,335984,Delhi,2008-04-19,MF Maharoof,Feroz Shah Kotla,0,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,N,,Aleem Dar,GA Pratapkumar
3,335985,Mumbai,2008-04-20,MV Boucher,Wankhede Stadium,0,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,N,,SJ Davis,DJ Harper
4,335986,Kolkata,2008-04-20,DJ Hussey,Eden Gardens,0,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,N,,BF Bowden,K Hariharan


In [12]:
# Column to predict, and the columns used as predictors.
# NOTE(review): the sample rows show `result_margin` holding run margins for
# result == 'runs' and wicket margins for result == 'wickets', so the target
# appears to mix two units -- confirm this is intended.
# NOTE(review): `winner` is a post-match column, like the margin itself --
# confirm it is meant to be available as a predictor.
target = 'result_margin'
features = [
    'city',
    'neutral_venue',
    'team1',
    'team2',
    'toss_winner',
    'toss_decision',
    'winner',
]

# Expand the categorical predictors into indicator columns and append the
# target as the last column. Rows with a missing target are still present at
# this point.
data_encoded = pd.get_dummies(data[features]).assign(**{target: data[target]})

# Preview the encoded table.
data_encoded.head()


Unnamed: 0,neutral_venue,city_Abu Dhabi,city_Ahmedabad,city_Bangalore,city_Bengaluru,city_Bloemfontein,city_Cape Town,city_Centurion,city_Chandigarh,city_Chennai,...,winner_Kochi Tuskers Kerala,winner_Kolkata Knight Riders,winner_Mumbai Indians,winner_Pune Warriors,winner_Rajasthan Royals,winner_Rising Pune Supergiant,winner_Rising Pune Supergiants,winner_Royal Challengers Bangalore,winner_Sunrisers Hyderabad,result_margin
0,0,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,140.0
1,0,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,33.0
2,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,9.0
3,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,5.0
4,0,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,5.0


In [17]:
# Keep only the matches that have a recorded result margin; rows with a
# missing target cannot be used for fitting or scoring.
data = data[data[target].notna()]

# Rebuild the indicator matrix from the filtered rows so that predictors and
# target share the same index, then keep a combined copy with the target as
# the last column.
X = pd.get_dummies(data[features])
y = data[target]
data_encoded = X.assign(**{target: y})

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an ordinary least-squares model on the training portion (`fit` returns
# the estimator itself) and predict the held-out margins.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out rows. RMSE is the square root of the MSE.
# Note: R-squared is negative when the model fits the test data worse than
# always predicting the mean of y_test.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Report all four metrics.
print("R-squared score:", r2)
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)

R-squared score: -0.2564885195282989
Mean Absolute Error (MAE): 18.1
Mean Squared Error (MSE): 747.0953125
Root Mean Squared Error (RMSE): 27.333044332821764
