In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [8]:
df = pd.read_csv('../dataset/clean_texas.csv')
df.head(50)

Unnamed: 0,Animal ID,date_intake,intake_type,intake_condition,sex_intake,age_intake,breed,color,date_outcome,dob,outcome_type,sex_outcome,age_outcome,stay_length
0,A664887,2013-10-10 13:48:00,Stray,Normal,Intact Female,1 month,Domestic Shorthair,Black,2013-11-10 16:56:00,08/10/2013,Adoption,Spayed Female,3 months,31
1,A665496,2013-10-18 18:07:00,Stray,Normal,Neutered Male,3 years,Other/Mix,Brown,2013-10-22 17:54:00,04/18/2010,Adoption,Neutered Male,3 years,3
2,A664936,2013-10-11 11:20:00,Stray,Normal,Intact Male,1 month,Domestic Medium,Brown,2013-12-18 18:18:00,08/18/2013,Adoption,Neutered Male,4 months,68
3,A665398,2013-10-17 12:26:00,Owner Surrender,Normal,Intact Female,1 month,Domestic Shorthair,White,2013-11-10 16:45:00,08/19/2013,Adoption,Spayed Female,2 months,24
4,A665426,2013-10-17 16:53:00,Stray,Normal,Intact Male,1 month,Domestic Shorthair,Black,2013-11-16 17:09:00,08/26/2013,Adoption,Neutered Male,2 months,30
5,A666091,2013-10-26 16:43:00,Stray,Normal,Intact Male,1 month,Domestic Medium,Black,2013-12-06 15:54:00,09/11/2013,Adoption,Neutered Male,2 months,40
6,A665994,2013-10-25 17:41:00,Stray,Normal,Intact Female,3 months,Domestic Shorthair,Calico,2013-11-05 19:22:00,07/25/2013,Adoption,Spayed Female,3 months,11
7,A664857,2013-10-10 07:47:00,Stray,Normal,Intact Male,6 months,Domestic Shorthair,Black,2013-10-14 17:34:00,04/10/2013,Adoption,Neutered Male,6 months,4
8,A664754,2013-10-08 16:46:00,Owner Surrender,Normal,Intact Male,5 months,Domestic Shorthair,Black,2013-10-13 18:03:00,04/23/2013,Adoption,Neutered Male,5 months,5
9,A664446,2013-10-03 13:06:00,Stray,Injured,Intact Male,3 years,Domestic Shorthair,Orange,2014-03-15 15:27:00,10/03/2010,Adoption,Neutered Male,3 years,163


In [9]:
# Assuming df is your DataFrame
df = df.drop(['Animal ID','date_intake','date_outcome','outcome_type','dob'], axis=1)
# Drop rows where 'stay_length' is negative
df = df[df['stay_length'] >= 0]

In [10]:
def years_to_months(age):
    try:
        # Extracting numeric part from the string and converting it to an integer
        age_numeric = int(age.split()[0])
        return age_numeric * 12 if age_numeric > 0 else 0
    except ValueError:
        # Handling the case where the conversion to an integer fails
        return 0

# Applying the conversion to the specified columns
df['age_intake'] = df['age_intake'].apply(years_to_months)
df['age_outcome'] = df['age_outcome'].apply(years_to_months)

In [12]:


# # Assuming 'color', 'breed', 'intake_type', 'intake_condition', 'sex_intake', 'sex_outcome' are your categorical columns

# # Create dummy variables for categorical columns
categorical_columns = ['intake_type', 'intake_condition', 'sex_intake', 'sex_outcome', 'color', 'breed']
# df = pd.get_dummies(df, columns=categorical_columns)


In [13]:
# Label Encoding:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])


In [14]:
df.head(50)

Unnamed: 0,intake_type,intake_condition,sex_intake,age_intake,breed,color,sex_outcome,age_outcome,stay_length
0,3,7,0,12,2,0,3,36,31
1,3,7,2,36,3,2,2,36,3
2,3,7,1,12,1,2,2,48,68
3,1,7,0,12,2,8,3,24,24
4,3,7,1,12,2,0,2,24,30
5,3,7,1,12,1,0,2,24,40
6,3,7,0,36,2,3,3,36,11
7,3,7,1,72,2,0,2,72,4
8,1,7,1,60,2,0,2,60,5
9,3,3,1,36,2,4,2,36,163


In [15]:
column_names = df.columns.tolist()
print(column_names)

['intake_type', 'intake_condition', 'sex_intake', 'age_intake', 'breed', 'color', 'sex_outcome', 'age_outcome', 'stay_length']


In [8]:
# # Replace these placeholders with your actual column names
# feature_columns = ['age_intake', 
#                    'intake_type_Abandoned', 'intake_type_Owner Surrender', 
#                    'intake_type_Public Assist', 'intake_type_Stray',
#                    'intake_condition_Aged', 'intake_condition_Behavior', 
#                    'intake_condition_Feral', 'intake_condition_Injured',
#                    'intake_condition_Med Attn', 'intake_condition_Medical', 
#                    'intake_condition_Neonatal', 'intake_condition_Normal', 
#                    'intake_condition_Nursing', 'intake_condition_Other', 
#                    'intake_condition_Pregnant', 'intake_condition_Sick', 
#                    'intake_condition_Unknown', 'sex_intake_Intact Female',
#                    'sex_intake_Intact Male', 'sex_intake_Neutered Male', 
#                    'sex_intake_Spayed Female', 'sex_outcome_Intact Female',
#                    'sex_outcome_Intact Male', 'sex_outcome_Neutered Male', 
#                    'sex_outcome_Spayed Female', 'color_Black', 'color_Blue', 
#                    'color_Brown', 'color_Calico', 'color_Orange', 'color_Other',
#                    'color_Torbie', 'color_Tortie', 'color_White', 'breed_Domestic Longhair', 
#                    'breed_Domestic Medium', 'breed_Domestic Shorthair', 'breed_Other/Mix']
# target_column = 'stay_length'

# # Extract features (X) and target variable (y)
# X = df[feature_columns]
# y = df[target_column]

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Create a Random Forest Regressor with hyperparameter tuning
# random_forest = RandomForestRegressor(n_estimators=200, max_depth=15, min_samples_split=5, random_state=42)

# # Train the model
# random_forest.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = random_forest.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)
# print(f'Mean Squared Error: {mse}')
# print(f'R-squared (R2) Score: {r2}')

# # Visualization of predicted vs actual values
# plt.scatter(y_test, y_pred, color='blue', label='Predicted vs Actual')
# plt.xlabel('Actual Values')
# plt.ylabel('Predicted Values')
# plt.title('Random Forest Regression: Predicted vs Actual')
# plt.legend()
# plt.show()

In [16]:
df.to_csv('labelencoding_texas.csv', index=False)

In [11]:
df = pd.read_csv('../dataset/dummies_texas.csv')
df.head(50)

Unnamed: 0,age_intake,age_outcome,stay_length,intake_type_Abandoned,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_condition_Aged,intake_condition_Behavior,intake_condition_Feral,...,color_Calico,color_Orange,color_Other,color_Torbie,color_Tortie,color_White,breed_Domestic Longhair,breed_Domestic Medium,breed_Domestic Shorthair,breed_Other/Mix
0,12,36,31,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,36,36,3,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,12,48,68,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,12,24,24,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,12,24,30,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,12,24,40,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,36,36,11,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
7,72,72,4,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,60,60,5,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,36,36,163,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
