<a href="https://colab.research.google.com/github/DomLah/EU-AI-Act/blob/main/Wealth_Dom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

In [44]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading the Denver file as a Training file

In [12]:
file_path = '/content/drive/My Drive/Colab/DenverForModeling.xls'
df = pd.read_excel(file_path)

In [13]:
# Print the first few rows (header) of the result DataFrame
print(df.head())

           Full Name                             Url  Twitter Followers  \
0      Dan Daugherty              http://twitter.com                NaN   
1         Ann Pierce       http://twitter.com/_annp_                1.0   
2  Christopher Klein  http://twitter.com/_chrisklein              181.0   
3       Jeffrey Lowe    http://twitter.com/_jefflowe               78.0   
4    Torrie Hrdlicka  http://twitter.com/_lil_tortor               38.0   

                                                 Bio          Job Title  \
0                                                NaN    CEO and Founder   
1                                                NaN                NaN   
2  Living in Denver - I'm a software engineer and...  Cofounder and CPO   
3                 Co-Founder @EquipmentShare - YCW15         Co-Founder   
4  üìÜ 24 |\rüì±Digital Media Production | üé•...                NaN   

          Company Gender  Dob Start    Dob End Label:Category  
0        RentBits    Man        Na

Add a column if the bio is empty

In [14]:
# Add a new column 'Empty_bio' which is True if the bio is empty or only contains whitespace
df['Empty_bio'] = df['Bio'].apply(lambda bio: True if not str(bio).strip() else False)

Loading the list of the most frequent keywords in the bio

In [15]:
# Define the list of keywords
keywords = [
    'advisor', 'acquired', 'ai', 'alum', 'angel', 'author', 'artist', 'board', 'business', 'capital', 'cardiac','care'
    'ceo', 'cto', 'cio', 'cfo', 'cmo', 'chief', 'coach', 'cofounder', 'companies', 'company', 'cto',
    'dad', 'data', 'dentist', 'design', 'development', 'digital', 'director', 'emergency','engineer',
    'enthusiast', 'entrepreneur', 'estate', 'executive', 'family', 'former', 'founder',
    'global', 'growth', 'guy', 'health', 'helping', 'husband', 'innovation',
    'investor', 'investment', 'leader', 'life', 'love', 'lover', 'make', 'managing', 'marketing',
    'media', 'medicine', 'member', 'music', 'nerd', 'officer', 'opinions', 'owner', 'partner', 'physician',
    'president', 'product', 'real', 'science','social', 'software', 'speaker', 'specialist','sports',
    'startups','surgeon', 'surgery' 'tech', 'techstars', 'technology', 'things', 'tweets', 'vice', 'venture', 'writer'
]
# Convert keywords list to a set to remove any duplicates
keywords = list(set(keywords))



In [16]:
# Create new columns based on the presence of keywords in 'Bio'
for keyword in keywords:
    # Create a unique column name for each keyword
    column_name = 'kw_' + keyword
    df[column_name] = df['Bio'].apply(lambda bio: 1 if keyword in str(bio).lower() else 0)


In [17]:
# Define a function to categorize into generational groups based on birth year
def categorize_generation(row):
    if pd.isna(row['Dob Start']):
        return 'Unknown'  # Return 'Unknown' if the date is NaN
    year = row['Dob Start'].year
    if 1997 <= year <= 2012:
        return 'Gen Z'
    elif 1981 <= year <= 1996:
        return 'Millennials'
    elif 1965 <= year <= 1980:
        return 'Gen X'
    elif 1946 <= year <= 1964:
        return 'Baby Boomers'
    elif year <= 1945:
        return 'Silent Generation'
    else:
        return 'Unknown'

In [18]:
# Convert 'Dob Start' to datetime and extract the year
df['Dob Start'] = pd.to_datetime(df['Dob Start'], errors='coerce')
df['Birth Year'] = df['Dob Start'].dt.year
df['Generational Group'] = df.apply(categorize_generation, axis=1)

In [19]:
# Create new columns based on the presence of keywords in 'Bio'
for keyword in keywords:
    df[keyword] = df['Bio'].apply(lambda bio: 1 if keyword in str(bio).lower() else 0)


Add brackets for the number of followers

In [20]:
# Define bins and labels for Twitter Followers
bins = [0, 100, 500, 1000, 5000, 20000, 50000, 200000, float('inf')]
labels = ['1-100', '100-500', '500-1000', '1000-5000', '5000-20000', '20000-50000', '50000-200000', 'over200000']


In [21]:
# Bin the 'Twitter Followers' data
df['Follower_Category'] = pd.cut(df['Twitter Followers'], bins=bins, labels=labels, include_lowest=True)


In [22]:
# Remove rows where the category is 'Celebrity'
df = df[df['Label:Category'] != 'Celebrity']

In [23]:
# Select features and target for the model
# Feature columns
feature_cols = ['Job Title', 'Company', 'Gender', 'Generational Group', 'Empty_bio', 'Follower_Category'] + ['kw_' + k for k in keywords]
X = df[feature_cols]
y = df['Label:Category']

In [24]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:
print(X_train.head())


                          Job Title               Company Gender  \
497               Managing Director              Deloitte    Man   
1454             CEO and co-Founder         Solid Silicon    Man   
1283  VP Marketing & Communications            Infleqtion  Woman   
983               Basketball Player  Los Angeles Clippers    Man   
1273                            NaN                   NaN  Woman   

     Generational Group  Empty_bio Follower_Category  kw_development  \
497             Unknown      False           100-500               0   
1454              Gen X      False             1-100               0   
1283              Gen X      False         1000-5000               0   
983             Unknown      False      50000-200000               0   
1273              Gen X      False           100-500               0   

      kw_former  kw_officer  kw_enthusiast  ...  kw_data  kw_surgerytech  \
497           0           0              0  ...        0               0   
1454  

In [26]:
# Define preprocessing steps for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')


In [27]:
# Define a column transformer to apply the appropriate preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, feature_cols)
    ])


In [28]:
# Create and configure the Gradient Boosting classifier model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier())
])


In [30]:
# Train the model with the training data
model.fit(X_train, y_train)

In [31]:
# Predict the categories for the test set
y_pred = model.predict(X_test)


In [78]:
# Print a classification report to evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

Entrepreneur       0.49      0.75      0.60       100
   Executive       0.73      0.31      0.43        72
    Investor       0.68      0.57      0.62        40
         OUT       0.51      0.53      0.52        80
Professional       0.71      0.50      0.59        10
 Real Estate       0.75      0.43      0.55         7

    accuracy                           0.55       309
   macro avg       0.65      0.51      0.55       309
weighted avg       0.59      0.55      0.54       309



APPLYING THE MODEL TO A NEW FILE.**bold text**


In [65]:
# Load the new data (replace 'NewData.xls' with your actual file name)
new_data_path = '/content/drive/My Drive/Colab/DallasMusicToScore.xls'  # Update the path as needed
new_df = pd.read_excel(new_data_path)


In [66]:
# Method 1: Using `shape`
num_records_shape = new_df.shape[0]
print("Number of records using `shape`:", num_records_shape)

# Method 2: Using `len()`
num_records_len = len(new_df)
print("Number of records using `len()`:", num_records_len)

Number of records using `shape`: 7385
Number of records using `len()`: 7385


In [67]:
# Preprocess the new data
# Add the 'Empty_bio' column
new_df['Empty_bio'] = new_df['Bio'].apply(lambda bio: True if not str(bio).strip() else False)


In [68]:
# Create keyword columns
for keyword in keywords:
    column_name = 'kw_' + keyword
    new_df[column_name] = new_df['Bio'].apply(lambda bio: 1 if keyword in str(bio).lower() else 0)


In [69]:
# Convert 'Dob Start' to datetime and extract the year
new_df['Dob Start'] = pd.to_datetime(new_df['Dob Start'], errors='coerce')
new_df['Birth Year'] = new_df['Dob Start'].dt.year
new_df['Generational Group'] = df.apply(categorize_generation, axis=1)


In [70]:
# Bin the 'Twitter Followers' data
new_df['Follower_Category'] = pd.cut(new_df['Twitter Followers'], bins=bins, labels=labels, include_lowest=True)


In [71]:
# Prepare the feature set (ensure this matches the features used in your model)
new_X = new_df[['Job Title', 'Company', 'Gender', 'Generational Group', 'Empty_bio','Follower_Category'] + ['kw_' + k for k in keywords]]


In [72]:
# Make predictions
new_predictions = model.predict(new_X)


Creating a result file with the url, the predicted category and the probability

In [73]:
# Assuming 'new_df' is your new data DataFrame and 'model' is your trained model

# Make predictions and get probabilities
# predicted_categories = model.predict(new_X)
predicted_probabilities = model.predict_proba(new_X)


In [74]:
# Get the highest probability and its corresponding category for each instance
max_probs = predicted_probabilities.max(axis=1)
predicted_labels = [model.classes_[np.argmax(prob)] for prob in predicted_probabilities]


In [75]:
# Define probability bins and labels
prob_bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
prob_labels = ['0-10%', '10-20%', '20-30%', '30-40%', '40-50%', '50-60%', '60-70%', '70-80%', '80-90%', '90-100%']

# Categorize each probability into a bracket
prob_brackets = pd.cut(max_probs, bins=prob_bins, labels=prob_labels, include_lowest=True)


In [76]:
# Prepare the result DataFrame
result_df = pd.DataFrame({
    'URL': new_df['Url'],
    'Predicted Category': predicted_labels,
    'Probability': max_probs,
    'Probability Bracket': prob_brackets
})

In [77]:
# Save the results to a CSV file
result_file_path = '/content/drive/My Drive/Colab/prediction_dallasmusic.csv'  # Update the path as needed
result_df.to_csv(result_file_path, index=False)

print("Results saved to", result_file_path)

Results saved to /content/drive/My Drive/Colab/prediction_dallasmusic.csv
