In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the dataset from the CSV file and preview its first five rows.
df = pd.read_csv('dielectron.csv')
df.head()

In [None]:
# Print the DataFrame summary produced by DataFrame.info()
# (column names, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# Count the missing entries in every column.
df.isna().sum()

In [None]:
# Discard every row that contains a missing value, then re-count the
# per-column missing entries so the result can be inspected.
df = df.dropna(axis='index')
df.isna().sum()

In [None]:
# check for duplicates: number of rows that exactly repeat an earlier row
# (DataFrame.duplicated() flags every occurrence after the first by default)
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each repeated row, then re-count the
# remaining duplicates so the result can be inspected.
df = df.drop_duplicates(keep='first')
df.duplicated().sum()

In [None]:
# Histogram of the M column (200 bins) on a square 6x6 figure.
fig_mass, ax_mass = plt.subplots(figsize=(6, 6))
df['M'].plot(kind='hist', bins=200, color='blue', ax=ax_mass)
ax_mass.set_title('Distribution of M')
ax_mass.set_xlabel('M - mass')
ax_mass.grid()
plt.show()

In [None]:
# Detect outliers for px1, px2, py1, py2, pz1, pz2 using boxplots,
# one panel per column on a 3x2 grid.
# NOTE(review): 'px1 ' is spelled with a trailing space, unlike the other
# five names - presumably it mirrors the CSV header; confirm.
momentum_cols = ['px1 ', 'px2', 'py1', 'py2', 'pz1', 'pz2']
plt.figure(figsize=(10,10))
for slot, col in enumerate(momentum_cols, start=1):
    plt.subplot(3, 2, slot)
    sns.boxplot(x=df[col])
plt.show()

In [None]:
# Create a scatter plot between M and each model feature.
#
# The feature columns are derived here directly from `df`, so this cell does
# not depend on `features` / `labels`, which are only created by a later cell
# (the original version failed on a fresh kernel with Run All).
# Each panel is labelled with the name of the column actually plotted;
# indexing `df.columns` by the position inside the reduced feature matrix
# picked the wrong names because the dropped columns shift the positions.
scatter_df = df.drop(['M', 'pt1', 'pt2', 'E1', 'E2','Q1', 'Q2', 'phi1', 'phi2', 'eta1', 'eta2'], axis=1)
mass = df['M']

plt.figure(figsize=(20,20))
for i, col in enumerate(scatter_df.columns):
    # 5x5 grid: room for up to 25 feature panels
    plt.subplot(5, 5, i+1)
    plt.scatter(scatter_df[col], mass, color='blue', s=1)
    plt.xlabel(col)
    plt.ylabel('M')
plt.show()


In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
# with the colour scale pinned to the full [-1, 1] range.
corr = df.corr()
fig, heat_ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, cmap='RdYlGn', vmin=-1, vmax=1, ax=heat_ax)
plt.show()

In [None]:
# Build the model inputs: every column except the target M and the listed
# columns becomes a feature; M is the regression target.
non_feature_cols = ['M', 'pt1', 'pt2', 'E1', 'E2','Q1', 'Q2', 'phi1', 'phi2', 'eta1', 'eta2']
features = df.drop(columns=non_feature_cols).to_numpy()
labels = df['M'].to_numpy()

In [None]:
# Split data into train (80%), validation (10%) and test (10%) sets.
#
# A fixed random_state makes the partition - and therefore every metric
# reported by the model cells - reproducible across kernel restarts.
# train_size is omitted because it is implied by test_size.
X_train, X, y_train, y = train_test_split(features, labels, test_size=0.2, random_state=42)
# The 20% hold-out is halved into validation and test parts.
X_val, X_test, y_val, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape

In [None]:
# Standardize data.
#
# The scaler is fitted on the training split ONLY and then applied unchanged
# to the validation and test splits. Fitting a separate scaler per split
# (as before) leaks held-out statistics and maps each split into a different
# feature space than the one the models are trained in.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_val_std = scaler.transform(X_val)
X_test_std = scaler.transform(X_test)

# Before/after check on the training split:
# (raw mean == 0, standardized mean == 0, raw std == 1, standardized std == 1)
(
    np.allclose(X_train.mean(axis=0), np.zeros(X_train.shape[1])),
    np.allclose(X_train_std.mean(axis=0), np.zeros(X_train_std.shape[1])),
    np.allclose(X_train.std(axis=0), np.ones(X_train.shape[1])),
    np.allclose(X_train_std.std(axis=0), np.ones(X_train_std.shape[1])),
)

In [None]:
# Random Forest Regression: fit a 100-tree forest on the standardized
# training split, report per-feature importances, then the RMSE on the
# training and test splits.
clf = RandomForestRegressor(n_estimators=100)
clf.fit(X_train_std, y_train)
print("feature importancy", clf.feature_importances_)

train_pred = clf.predict(X_train_std)
train_mse = mean_squared_error(y_train, train_pred)
print("training error", np.sqrt(train_mse))

# y_pred / rmse are left holding the test-split results.
y_pred = clf.predict(X_test_std)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
print("test error", rmse)

In [None]:
# Overlay predicted (red) and actual (blue) values against sample position.
figure = plt.figure(figsize=(20,10))
sample_idx = range(y_pred.shape[0])
for values, colour in ((y_pred, "red"), (y_test, "blue")):
    plt.scatter(sample_idx, values, c=colour)
plt.show()

In [None]:
# Side-by-side histograms of predicted (left) and actual (right) values,
# each with a dashed vertical line marking its mean.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
for panel, values in zip(axs, (y_pred, y_test)):
    panel.hist(values, bins=100)
    panel.axvline(values.mean(), color='k', linestyle='dashed', linewidth=1)
plt.show()


In [None]:
# Decision Tree Regression: fit on the standardized training split, print the
# training RMSE, and display the test RMSE as the cell result.
clf = tree.DecisionTreeRegressor()
clf.fit(X_train_std, y_train)

train_pred = clf.predict(X_train_std)
train_mse = mean_squared_error(y_train, train_pred)
print(np.sqrt(train_mse))

# y_pred / rmse are left holding the test-split results.
y_pred = clf.predict(X_test_std)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
rmse

In [None]:
# Neural Network Regression.
#
# The network is trained AND evaluated on standardized inputs. Previously it
# was fitted on the raw X_train but scored on X_test_std, so the test inputs
# were on a different scale from anything the model had seen and the reported
# test RMSE was meaningless.
regr = MLPRegressor(max_iter=1000, learning_rate="adaptive").fit(X_train_std, y_train)

# training error
y_pred = regr.predict(X_train_std)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print(rmse)

# test error (displayed as the cell result)
y_pred = regr.predict(X_test_std)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

In [None]:
# Polynomial Regression (degree 3) on the standardized features.
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X_train_std)
# The polynomial expansion already contains the constant (bias) column,
# so the linear model is fitted without its own intercept.
clf = LinearRegression(fit_intercept=False)
clf.fit(X_poly, y_train)

# training error
y_pred_train = clf.predict(X_poly)
rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
print(rmse)

# test error: the test features must go through the SAME fitted expansion
# (transform, not fit_transform). The previously commented-out evaluation
# predicted on the training matrix X_poly and compared against y_test.
X_poly_test = poly.transform(X_test_std)
y_pred = clf.predict(X_poly_test)  # y_pred is left holding test predictions
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

# training fit: predicted (red) vs actual (blue) values
figure = plt.figure(figsize=(20,20))
plt.scatter(range(y_pred_train.shape[0]), y_pred_train, c="red")
plt.scatter(range(y_train.shape[0]), y_train, c="blue")
plt.show()

In [None]:
# Side-by-side histograms of y_pred (left) and y_test (right), each with a
# dashed vertical line marking its mean.
# NOTE(review): y_pred is not defined in this cell - it is whatever the most
# recently executed model cell left behind; confirm it matches y_test.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
for panel, values in zip(axs, (y_pred, y_test)):
    panel.hist(values, bins=100)
    panel.axvline(values.mean(), color='k', linestyle='dashed', linewidth=1)
plt.show()