In [15]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
# Pull from clean_data CSV
file_path = 'Starting_Data/Cleaned_Data.csv'
df = pd.read_csv(file_path)
df

Unnamed: 0,max_ma,min_ma,genus,lng,lat,lithology1,environment,THY0orTHE1
0,83.5,70.6,Gorgosaurus,-111.528732,50.740726,sandstone,channel,1
1,83.5,70.6,Gorgosaurus,-111.549347,50.737015,sandstone,channel,1
2,83.5,70.6,Gorgosaurus,-111.564636,50.723866,sandstone,channel lag,1
3,83.5,70.6,Gorgosaurus,-111.525337,50.740471,not reported,terrestrial,1
4,83.5,70.6,Gorgosaurus,-111.601021,50.786999,sandstone,channel lag,1
...,...,...,...,...,...,...,...,...
3393,237.0,201.3,Grallator,-109.569000,38.602001,not reported,terrestrial,1
3394,83.5,70.6,Panoplosaurus,-111.564636,50.723866,not reported,terrestrial,0
3395,83.5,70.6,Richardoestesia,-108.320999,36.507999,not reported,terrestrial,1
3396,83.5,70.6,Richardoestesia,-108.320602,36.508499,sandstone,terrestrial,1


In [4]:
# Use Pandas GetDummys to encode catagorical data
dummy_df = df[['lithology1','environment']]
encoded_dummy_df = pd.get_dummies(dummy_df)

# Concat the encoded dataframe to the clean dataframe and drop the environment, genus, and lithology1 columns
encoded_clean_df = pd.concat([encoded_dummy_df, df], axis=1)
encoded_clean_df = encoded_clean_df.drop(['environment', 'lithology1', 'genus'], axis=1)
encoded_clean_df

Unnamed: 0,lithology1_ash,lithology1_carbonate,lithology1_chalk,lithology1_chert,lithology1_claystone,lithology1_conglomerate,lithology1_diatomite,lithology1_dolomite,lithology1_grainstone,lithology1_ironstone,...,environment_spring,environment_tar,environment_terrestrial,environment_transition zone,environment_wet floodplain,max_ma,min_ma,lng,lat,THY0orTHE1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,83.5,70.6,-111.528732,50.740726,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,83.5,70.6,-111.549347,50.737015,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,83.5,70.6,-111.564636,50.723866,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,83.5,70.6,-111.525337,50.740471,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,83.5,70.6,-111.601021,50.786999,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3393,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,237.0,201.3,-109.569000,38.602001,1
3394,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,83.5,70.6,-111.564636,50.723866,0
3395,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,83.5,70.6,-108.320999,36.507999,1
3396,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,83.5,70.6,-108.320602,36.508499,1


In [5]:
# Seperate features (X) from target(y)
y = encoded_clean_df['THY0orTHE1']
X = encoded_clean_df.drop(columns='THY0orTHE1')

In [6]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [7]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Create a Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

In [9]:
# Train the data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Score the model
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9270015698587127
Testing Data Score: 0.908235294117647


In [11]:
# Predict outcomes for test data set
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
774,1,1
1294,1,1
2169,1,1
906,1,1
1658,1,1
...,...,...
1030,1,1
120,1,1
2035,1,1
1690,1,0


In [12]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.908235294117647

In [13]:
# Generate training predictions
training_predictions = classifier.predict(X_train)

# Generate testing predictions
testing_predictions = classifier.predict(X_test)

In [16]:
# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[   3  172]
 [  14 2359]]


In [17]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[  0  75]
 [  3 772]]


In [18]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

           0       0.18      0.02      0.03       175
           1       0.93      0.99      0.96      2373

    accuracy                           0.93      2548
   macro avg       0.55      0.51      0.50      2548
weighted avg       0.88      0.93      0.90      2548



In [19]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        75
           1       0.91      1.00      0.95       775

    accuracy                           0.91       850
   macro avg       0.46      0.50      0.48       850
weighted avg       0.83      0.91      0.87       850

