In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline

In [None]:
# Load the digital sky survey CSV; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="20-digitalskysurvey.csv")

In [None]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Columns removed before modelling.
# NOTE(review): these look like survey identifiers / bookkeeping fields rather
# than physical measurements — confirm against the dataset description.
columns_to_drop = [
    "objid",
    "specobjid",
    "run",
    "rerun",
    "camcol",
    "field",
]

In [None]:
# Reassign instead of mutating in place, and tolerate columns that are already
# gone, so that re-running this cell does not raise KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# Check which columns remain after the drop.
df.head(n=5)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Count how many rows carry each label in the "class" column.
class_counts = df["class"].value_counts()
class_counts

In [None]:
# Plot "ra" against "redshift", one colour per "class" value.
fig, ax = plt.subplots()
sns.scatterplot(x="redshift", y="ra", hue="class", data=df, ax=ax)
plt.show()

In [None]:
# Plot "dec" against "redshift", one colour per "class" value.
fig, ax = plt.subplots()
sns.scatterplot(x="redshift", y="dec", hue="class", data=df, ax=ax)
plt.show()

In [None]:
# Plot "plate" against "redshift", one colour per "class" value.
fig, ax = plt.subplots()
sns.scatterplot(x="redshift", y="plate", hue="class", data=df, ax=ax)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary first, then overwrite the "class" column with the
# corresponding integer codes (LabelEncoder numbers labels in sorted order).
le = LabelEncoder()
le.fit(df["class"])
df["class"] = le.transform(df["class"])

In [None]:
# Verify that the "class" column now holds integer codes.
df.head(n=5)

In [None]:
# Pairwise correlation matrix of the columns (Pearson is the pandas default).
df.corr(method="pearson")

In [None]:
# Grid of pairwise plots over the frame's columns, one colour per "class" value.
pair_grid = sns.pairplot(data=df, hue="class")
plt.show()

In [None]:
# One redshift histogram per encoded class, drawn side by side.
# Panel order and (title, code) pairs are fixed here: Star=2, Galaxy=0, QSO=1.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 4))
panels = [("Star", 2), ("Galaxy", 0), ("QSO", 1)]
for ax, (title, code) in zip(axes, panels):
    sns.histplot(df.loc[df["class"] == code, "redshift"], ax=ax)
    ax.set_title(title)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every column except the target; the target is the "class" column.
X = df.drop(columns="class")
y = df["class"]

# Hold out 33% of the rows for testing. stratify=y keeps the class proportions
# equal in both partitions, so the per-class metrics computed later are not
# skewed by an unlucky draw; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=15, stratify=y
)

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same transform to both splits so the test rows never influence the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
!pip install xgboost

In [None]:
from xgboost import XGBClassifier

# Baseline boosted-tree classifier with 100 estimators. fit() returns the
# estimator itself, so construction and training are chained; the trailing
# bare name displays the fitted model as the cell output.
xgb = XGBClassifier(n_estimators=100).fit(X_train, y_train)
xgb

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the baseline model on the held-out split.
y_pred = xgb.predict(X_test)
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_acc = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)

print("confusion matrix: ", baseline_cm)
print("accuracy score: ", baseline_acc)
print(baseline_report)

In [None]:
# Hyper-parameter grid for the search: 4 * 2 * 5 * 5 = 200 combinations.
params = dict(
    n_estimators=[100, 200, 300, 500],
    learning_rate=[0.01, 0.1],
    max_depth=[5, 8, 12, 20, 30],
    colsample_bytree=[0.3, 0.4, 0.5, 0.8, 1],
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params` with 5-fold cross-validation; n_jobs=-1
# lets the search use every available core.
grid = GridSearchCV(
    XGBClassifier(),
    params,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the search on the training split, then score the selected model on the
# held-out split (this overwrites the baseline's y_pred).
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

tuned_cm = confusion_matrix(y_test, y_pred)
tuned_acc = accuracy_score(y_test, y_pred)
tuned_report = classification_report(y_test, y_pred)

print("confusion matrix: ", tuned_cm)
print("accuracy score: ", tuned_acc)
print(tuned_report)