In [32]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [15]:
# Pull from clean_data CSV
file_path = 'Starting_Data/cleaned_data_revisedSet.csv'
df = pd.read_csv(file_path)
df

Unnamed: 0,max_ma,min_ma,genus,lng,lat,lithology1,environment,Orn0Saur1
0,155.7,132.9,Chaoyangsaurus,123.966698,42.933300,mudstone,fluvial,0
1,83.5,70.6,,-111.461914,50.753296,not reported,terrestrial,0
2,83.5,70.6,Centrosaurus,-111.528931,50.737297,not reported,terrestrial,0
3,83.5,70.6,Gryposaurus,-111.529999,50.770000,not reported,terrestrial,0
4,70.6,66.0,Ankylosaurus,-106.569901,47.637699,not reported,terrestrial,0
...,...,...,...,...,...,...,...,...
18812,70.6,66.0,,-104.427002,43.505402,claystone,floodplain,1
18813,70.6,66.0,Struthiomimus,-104.427002,43.505402,claystone,floodplain,1
18814,70.6,66.0,,-104.427002,43.505402,claystone,floodplain,1
18815,70.6,66.0,,-104.427002,43.505402,claystone,floodplain,1


In [16]:
# Use Pandas GetDummys to encode catagorical data
dummy_df = df[['lithology1','environment']]
encoded_dummy_df = pd.get_dummies(dummy_df)

# Concat the encoded dataframe to the clean dataframe and drop the environment, genus, and lithology1 columns
encoded_clean_df = pd.concat([encoded_dummy_df, df], axis=1)
encoded_clean_df = encoded_clean_df.drop(['environment', 'lithology1', 'genus', 'lithology1_not reported'], axis=1)
encoded_clean_df

Unnamed: 0,lithology1_amber,lithology1_bindstone,lithology1_breccia,lithology1_carbonate,lithology1_chalk,lithology1_chert,lithology1_claystone,lithology1_coal,lithology1_conglomerate,lithology1_diatomite,...,environment_spring,environment_submarine fan,environment_terrestrial,environment_transition zone,environment_wet floodplain,max_ma,min_ma,lng,lat,Orn0Saur1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,155.7,132.9,123.966698,42.933300,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,83.5,70.6,-111.461914,50.753296,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,83.5,70.6,-111.528931,50.737297,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,83.5,70.6,-111.529999,50.770000,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,70.6,66.0,-106.569901,47.637699,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18812,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,70.6,66.0,-104.427002,43.505402,1
18813,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,70.6,66.0,-104.427002,43.505402,1
18814,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,70.6,66.0,-104.427002,43.505402,1
18815,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,70.6,66.0,-104.427002,43.505402,1


In [17]:
# Seperate features (X) from target(y)
y = encoded_clean_df['Orn0Saur1']
X = encoded_clean_df.drop(columns='Orn0Saur1')

In [21]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [22]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [26]:
# Create a Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs', random_state=1,  max_iter=20000)
classifier

LogisticRegression(max_iter=20000, random_state=1)

In [27]:
# Train the data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=20000, random_state=1)

In [28]:
# Score the model
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6962868480725624
Testing Data Score: 0.6809776833156217


In [12]:
# Predict outcomes for test data set
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
8563,1,1
12531,1,1
7025,1,1
2012,0,0
9693,0,1
...,...,...
2722,0,0
13571,1,1
9520,1,1
6066,1,0


In [29]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.6814027630180659

In [33]:
confusion_matrix(y_test, predictions)

array([[ 568, 1068],
       [ 431, 2638]], dtype=int64)