# Classical Machine Learning Tutorial with SciKitLearn

*   Part 1: Random Forest Classifier
*   Part 2: Linear Regression
*   Part 3: Exercise





In [245]:
from google.colab import drive
drive.mount('/content/drive', force_remount= True)

Mounted at /content/drive


## Imports!

In [246]:
import pandas as pd
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
import numpy as np






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Part 1: Training a Random Forest Classifier

In [247]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/TRIADS_workshops/ml_text_analysis/sample_texts.csv")

In [248]:
#Just taking a look at the sample data
df.head(5)

Unnamed: 0,id,text,category,author,model,nation,gender,race,mean_sen_len,sentiment,TTR,lex_density,NN,VB,JJ
0,3446,"A soft strain of music sounded, and then at th...",authentic,alcott,authentic,American,female,white,23.2,-0.965358,0.719298,0.578947,0.27193,0.026316,0.087719
1,4429,"â€œThere is a demand for whisky, but I think y...",authentic,alcott,authentic,American,female,white,22.833333,-0.364037,0.689394,0.537879,0.128788,0.075758,0.060606
2,7764,I never lived out before: thatâ€™s the reason ...,authentic,alcott,authentic,American,female,white,34.0,0.997504,0.730159,0.539683,0.198413,0.047619,0.087302
3,751,"""Well, I warn you that you are trifling with t...",authentic,alcott,authentic,American,female,white,26.6,-0.991689,0.740157,0.559055,0.165354,0.023622,0.102362
4,125,"I restrain myself as long as I can, but when I...",authentic,alcott,authentic,American,female,white,16.0,-0.976345,0.68254,0.595238,0.174603,0.095238,0.111111


In [249]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            2000 non-null   int64  
 1   text          2000 non-null   object 
 2   category      2000 non-null   object 
 3   author        2000 non-null   object 
 4   model         2000 non-null   object 
 5   nation        2000 non-null   object 
 6   gender        2000 non-null   object 
 7   race          2000 non-null   object 
 8   mean_sen_len  2000 non-null   float64
 9   sentiment     2000 non-null   float64
 10  TTR           2000 non-null   float64
 11  lex_density   2000 non-null   float64
 12  NN            2000 non-null   float64
 13  VB            2000 non-null   float64
 14  JJ            2000 non-null   float64
dtypes: float64(7), int64(1), object(7)
memory usage: 234.5+ KB


In [250]:
# Defining the target variable and the prediction features
y = df.category
X = df[['mean_sen_len', 'sentiment', 'TTR', 'lex_density', 'NN', 'VB', 'JJ']]


In [251]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [252]:
#Take a look at the feature data in the train set
X_train.head()

Unnamed: 0,mean_sen_len,sentiment,TTR,lex_density,NN,VB,JJ
968,36.75,-0.972402,0.744681,0.631206,0.269504,0.014184,0.106383
240,26.4,0.999101,0.705426,0.51938,0.170543,0.046512,0.046512
819,14.555556,-0.995631,0.762712,0.567797,0.186441,0.025424,0.067797
692,24.5,-0.99467,0.729167,0.520833,0.229167,0.010417,0.0625
420,35.25,-0.370412,0.728682,0.55814,0.170543,0.054264,0.054264


In [253]:
# Initialize the Random Forest model
model = RandomForestClassifier(random_state=42)

In [254]:
# Train the model
model.fit(X_train, y_train)

In [255]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [256]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.96


In [257]:
#Detailed classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   authentic       0.94      0.97      0.96       193
   synthetic       0.98      0.95      0.96       207

    accuracy                           0.96       400
   macro avg       0.96      0.96      0.96       400
weighted avg       0.96      0.96      0.96       400



In [258]:
#Evaluating incorrect classifications

incorrect_indices = y_test != y_pred

# Use these indices to select incorrect rows
incorrectly_labeled = X_test[incorrect_indices].copy()

# Add actual and predicted categories
incorrectly_labeled['actual_category'] = y_test[incorrect_indices]
incorrectly_labeled['predicted_category'] = y_pred[incorrect_indices]

# Print the incorrectly labeled rows sorted by actual category
print("Incorrectly labeled samples (correct/predicted):")
print(incorrectly_labeled[['actual_category', 'predicted_category']].sort_values('actual_category'))


Incorrectly labeled samples (correct/predicted):
     actual_category predicted_category
30         authentic          synthetic
44         authentic          synthetic
485        authentic          synthetic
1803       authentic          synthetic
1449       authentic          synthetic
1981       synthetic          authentic
1905       synthetic          authentic
949        synthetic          authentic
1728       synthetic          authentic
1950       synthetic          authentic
1990       synthetic          authentic
1934       synthetic          authentic
1937       synthetic          authentic
581        synthetic          authentic
1924       synthetic          authentic
1745       synthetic          authentic


In [259]:
#Calculate which features were most important for the classification

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_})

# Sort features by importance
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Print the feature importance
print(feature_importance)

        feature  importance
4            NN    0.301364
5            VB    0.220186
6            JJ    0.137444
1     sentiment    0.120191
0  mean_sen_len    0.112683
3   lex_density    0.074183
2           TTR    0.033948


## Part 2: Linear Regression for Continuous Variable Data

In [260]:
# Get the dataset
airbnb_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/TRIADS_workshops/ml_text_analysis/AB_NYC_2019.csv")


In [261]:
# Take a look at the data
airbnb_data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [262]:
# Define prediction and target variables

X = airbnb_data['name'].astype(str)
y = airbnb_data['price']

In [263]:
# Vectorize text into tf-idf scores
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)


In [264]:
# Create Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)


In [265]:
# Fit a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [266]:
# 5. Predict and evaluate
y_pred = model.predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))

R² Score: -0.4698674276377539


In [267]:
# Get most predictive feature based on model co-efficients

feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_

coef_df = pd.DataFrame({'word': feature_names, 'coefficient': coefs})
top_word = coef_df.sort_values(by='coefficient', ascending=False).iloc[0]['word']

print(f"Most predictive word: '{top_word}'")


Most predictive word: 'manhattans'


# Exercise

Use data of your own to run either a binary classifier or linear regression in a different code notebook. Remember, you want target and prediction variables.

Note: If you have text data and do not have features, please see the code below to auto-generate features for binary classification