In [1]:
import pandas as pd

In [2]:
# Load the raw table; the path is resolved relative to the kernel's working directory.
DATA_PATH = "dataset.csv"
dataset = pd.read_csv(DATA_PATH)

In [3]:
# Remove the 'index' column from the table.
# `dataset` is reassigned from itself, so re-running this cell used to raise
# KeyError (the column is already gone after the first run). errors="ignore"
# makes the cell idempotent.
dataset = dataset.drop(columns="index", errors="ignore")

In [4]:
# Preview the first rows of the table.
dataset.head()

In [5]:
# Summary statistics as reported by DataFrame.describe().
dataset.describe()

In [6]:
# Column names, non-null counts and dtypes.
dataset.info()

In [7]:
#histogram:
%matplotlib inline 
dataset.hist(bins=50,figsize=(20,15))

In [8]:
# Correlation heatmap: colour-coded matrix of the pairwise column correlations.
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(15, 13))
sns.heatmap(dataset.corr(), ax=ax)
plt.show()

## features and labels:

In [9]:
# Target column ("Result") as an independent copy; every other column is a feature.
labels = dataset["Result"].copy()
feature = dataset.drop(columns=["Result"])

## training and testing:

In [10]:
from sklearn.model_selection import train_test_split

# Random 80/20 split of the full table (target column included). The fixed
# random_state makes the shuffle reproducible across runs.
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)

# Report the size of each partition.
print(f"Rows in train set : {len(train_set)}\nRows in test set : {len(test_set)}\n")

## correlation matrix:

In [11]:
# Pairwise correlation between the columns of the table (pandas default method).
corr_matrix = dataset.corr()

In [12]:
# Correlation of every column with the "Result" column, largest value first.
corr_matrix["Result"].sort_values(ascending=False)

In [13]:
from pandas.plotting import scatter_matrix

# "Result" plus four other columns, plotted against each other pairwise.
attributes = [
    "Result",
    "SSLfinal_State",
    "URL_of_Anchor",
    "Shortining_Service",
    "Domain_registeration_length",
]
scatter_matrix(dataset[attributes], figsize=(12, 8))

## missing attributes:

In [14]:
from sklearn.impute import SimpleImputer

# Median imputer: learns one fill value per feature column.
# NOTE(review): the medians are learned from the whole feature table, i.e.
# before the train/test split made later in the notebook.
imputer = SimpleImputer(strategy="median")
imputer.fit(X=feature)

In [15]:
# Per-column fill values (medians, given strategy="median") learned by the imputer.
imputer.statistics_

In [16]:
# Apply the fitted imputer, wrap the result back into a DataFrame that keeps
# the original column names, then show its summary statistics.
x = imputer.transform(feature)
feature = pd.DataFrame(data=x, columns=feature.columns)
feature.describe()

## splitting into x train/test and y train/test:

In [17]:
# Hold out the last TEST_ROWS rows for testing and train on everything before them.
#
# The train and test slices previously used two independent hard-coded bounds
# (a negative offset of 2211 and an absolute start of 8845). Those only line up
# for one exact row count; at any other length a row is either dropped from
# both sets or shared between them. A single boundary computed from the actual
# table length keeps the two sets disjoint and exhaustive.
#
# NOTE(review): this is an ordered, unshuffled split — confirm the rows are not
# sorted in a way that biases the held-out tail.
TEST_ROWS = 2211
split_at = max(len(feature) - TEST_ROWS, 0)  # never negative for short tables

x_train = feature.iloc[:split_at]
x_test = feature.iloc[split_at:]
y_train = labels.iloc[:split_at]
y_test = labels.iloc[split_at:]

## model selection:

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Classifier under study; swap in the commented-out line to try a decision tree.
# model = DecisionTreeClassifier()
model = LogisticRegression()
model.fit(X=x_train, y=y_train)

In [19]:
# Predict labels for the held-out rows.
a = model.predict(x_test)

In [20]:
# Predicted labels.
a

In [21]:
# True labels for the same rows, for comparison with `a`.
list(y_test)

## Cross validation:

In [22]:
# Disabled alternative: mean cross-validated score of the model over 5 folds.
# import numpy as np
# from sklearn.model_selection import cross_val_score
# print(np.mean(cross_val_score(model , x_train , y_train , cv=5)))

from sklearn.model_selection import cross_val_predict

# Out-of-fold class predictions for every training row (5-fold CV).
# method="decision_function" is deliberately not passed here: it cannot be
# used when `model` is switched to the decision tree.
y_pred = cross_val_predict(estimator=model, X=x_train, y=y_train, cv=5)

In [23]:
# Cross-validated predictions for the training rows.
y_pred

## calculating confusion matrix:

In [24]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the cross-validated predictions.
confusion_matrix(y_true=y_train, y_pred=y_pred)

## precision and recall:

In [25]:
from sklearn.metrics import precision_score, recall_score

# Precision of the cross-validated predictions on the training rows.
precision_score(y_true=y_train, y_pred=y_pred)

In [26]:
# Recall of the cross-validated predictions on the training rows.
recall_score(y_true=y_train, y_pred=y_pred)

## F1 - score:

In [27]:
from sklearn.metrics import f1_score

# F1 score of the cross-validated predictions on the training rows.
f1_score(y_true=y_train, y_pred=y_pred)

## precision - recall curve:

In [28]:
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_predict

# precision_recall_curve sweeps a decision threshold, so it needs a continuous
# score per sample. Passing the hard class predictions in `y_pred` collapses
# the curve to a couple of points, so out-of-fold scores are computed here.
# Assumes a binary target, as the precision/recall cells in this notebook do.
if hasattr(model, "decision_function"):
    y_scores = cross_val_predict(
        model, x_train, y_train, cv=5, method="decision_function"
    )
else:
    # Estimators without decision_function (e.g. the decision-tree alternative)
    # expose class probabilities instead. Columns follow the sorted class
    # labels, so the last column is the score of the largest (positive) label.
    y_scores = cross_val_predict(
        model, x_train, y_train, cv=5, method="predict_proba"
    )[:, -1]

precision, recall, threshold = precision_recall_curve(y_train, y_scores)

In [29]:
# Precision values returned by precision_recall_curve.
precision

In [30]:
# Recall values returned by precision_recall_curve.
recall

In [31]:
# Decision thresholds returned by precision_recall_curve.
threshold

In [32]:
import matplotlib.pyplot as plt

# Precision and recall as functions of the decision threshold. The last
# element of each curve is dropped so its length matches `threshold`.
fig, ax = plt.subplots()
ax.plot(threshold, precision[:-1], "b--", label="Precision")
ax.plot(threshold, recall[:-1], "g-", label="Recall")
ax.set_xlabel("threshold")
ax.legend(loc="upper right")
ax.set_ylim([0, 1])
plt.show()

## Saving our model:

In [33]:
from joblib import dump, load

# Persist the fitted estimator to disk with joblib.
dump(value=model, filename='FY_proect.joblib')