In [1]:
# Predicting Bee Colonies Survival 

In [2]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import sqlite3
from sqlalchemy import create_engine



In [3]:
## updated 
database_path = Path("./save_the_bees.db")

engine = create_engine(f"sqlite:///{database_path}")
conn = engine.connect()
sql_query = "SELECT * FROM bee_colonies"

df = pd.read_sql(sql_query, conn)



In [4]:
df = df[['state', 'num_colonies', 'max_colonies', 'lost_colonies',
       'percent_lost', 'added_colonies', 'renovated_colonies',
       'percent_renovated', 'quarter', 'year', 'varroa_mites',
       'other_pests_and_parasites', 'diseases', 'pesticides', 'other',
       'unknown']]
df

Unnamed: 0,state,num_colonies,max_colonies,lost_colonies,percent_lost,added_colonies,renovated_colonies,percent_renovated,quarter,year,varroa_mites,other_pests_and_parasites,diseases,pesticides,other,unknown
0,Alabama,7000,7000,1800,26,2800,250,4,1,2015,10.0,5.4,0.0,2.2,9.1,9.4
1,Arizona,35000,35000,4600,13,3400,2100,6,1,2015,26.9,20.5,0.1,0.0,1.8,3.1
2,Arkansas,13000,14000,1500,11,1200,90,1,1,2015,17.6,11.4,1.5,3.4,1.0,1.0
3,California,1440000,1690000,255000,15,250000,124000,7,1,2015,24.7,7.2,3.0,7.5,6.5,2.8
4,Colorado,3500,12500,1500,12,200,140,1,1,2015,14.6,0.9,1.8,0.6,2.6,5.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1448,West Virginia,7500,8000,1100,14,0,220,3,4,2022,33.4,3.8,0.8,0.0,6.4,0.5
1449,Wisconsin,26000,47000,3500,7,140,380,1,4,2022,23.2,21.4,19.4,17.5,9.9,11.7
1450,Wyoming,19500,21000,3200,15,640,0,0,4,2022,22.9,5.9,4.2,0.0,0.0,7.4
1451,Other,30030,30030,480,2,1190,130,0,4,2022,22.4,18.5,0.0,0.0,0.0,0.7


In [5]:
df["percent_lost"].describe()

count    1453.000000
mean       11.219546
std         7.375083
min         0.000000
25%         6.000000
50%        10.000000
75%        14.000000
max        65.000000
Name: percent_lost, dtype: float64

In [6]:
# Creating Bins for Colonies Lost

In [7]:
bins = [-1, 5, 10, 25, 66]
group_labels = [0, 1, 2, 3]
df["target"] = pd.cut(df["percent_lost"], bins, labels=group_labels)
df.dropna() 
df

Unnamed: 0,state,num_colonies,max_colonies,lost_colonies,percent_lost,added_colonies,renovated_colonies,percent_renovated,quarter,year,varroa_mites,other_pests_and_parasites,diseases,pesticides,other,unknown,target
0,Alabama,7000,7000,1800,26,2800,250,4,1,2015,10.0,5.4,0.0,2.2,9.1,9.4,3
1,Arizona,35000,35000,4600,13,3400,2100,6,1,2015,26.9,20.5,0.1,0.0,1.8,3.1,2
2,Arkansas,13000,14000,1500,11,1200,90,1,1,2015,17.6,11.4,1.5,3.4,1.0,1.0,2
3,California,1440000,1690000,255000,15,250000,124000,7,1,2015,24.7,7.2,3.0,7.5,6.5,2.8,2
4,Colorado,3500,12500,1500,12,200,140,1,1,2015,14.6,0.9,1.8,0.6,2.6,5.9,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1448,West Virginia,7500,8000,1100,14,0,220,3,4,2022,33.4,3.8,0.8,0.0,6.4,0.5,2
1449,Wisconsin,26000,47000,3500,7,140,380,1,4,2022,23.2,21.4,19.4,17.5,9.9,11.7,1
1450,Wyoming,19500,21000,3200,15,640,0,0,4,2022,22.9,5.9,4.2,0.0,0.0,7.4,2
1451,Other,30030,30030,480,2,1190,130,0,4,2022,22.4,18.5,0.0,0.0,0.0,0.7,0


In [8]:
df["target"].describe()

count     1453
unique       4
top          2
freq       625
Name: target, dtype: int64

In [9]:
# Model Iteration 1

In [10]:
y = df["target"]
X = df[["varroa_mites", "pesticides", "diseases", "unknown", "quarter", "other_pests_and_parasites", "year"]]

In [11]:
## Data cleansing

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1 
                                                    )
X_train.shape

(1089, 7)

In [13]:
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [14]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

In [15]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.5224977043158862
Testing Data Score: 0.5357142857142857


In [17]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,2,2
2,2,1
3,2,2
4,2,1
5,3,2
6,2,1
7,1,2
8,0,0
9,0,0


In [18]:
accuracy_score(y_test, predictions)

0.5357142857142857

In [19]:
# Model Iteration 2

In [20]:
# Create state dummy variables

In [21]:
## df = pd.get_dummies(df, columns=['state'])
## df

In [22]:
## y = df["percent_lost"]
## X = df.drop(columns=["percent_lost","state_United States", "state_Other", 'num_colonies', 'max_colonies', 'lost_colonies',
       'percent_lost', 'added_colonies', 'renovated_colonies', 'percent_renovated', 'quarter', 'year'])

IndentationError: unexpected indent (425144886.py, line 3)

In [None]:
## X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1 
                                                    )
## X_train.shape

In [None]:
## scaler = StandardScaler()

## X_scaler = scaler.fit(X_train)

## X_train_scaled = X_scaler.transform(X_train)
## X_test_scaled = X_scaler.transform(X_test)


In [None]:
## classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
## classifier

In [None]:
## classifier.fit(X_train, y_train)

In [None]:
## print(f"Training Data Score: {classifier.score(X_train, y_train)}")
## print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

In [None]:
## predictions = classifier.predict(X_test)
## results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
## results.head(10)

In [None]:
## accuracy_score(y_test, predictions)