In [None]:






import os
import math
import re
from datetime import datetime
import pandas as pd
import numpy as np
import concurrent.futures

# Translate a number word (or age-related phrase) into its numeric value
def word_to_number(word):
    """Look up *word* case-insensitively and return its number.

    Returns ``None`` when the word is not in the table.
    """
    # 'one' .. 'nineteen' map to 1 .. 19 by position
    ones_and_teens = (
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
        'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen',
    )
    # 'twenty' .. 'ninety' map to 20 .. 90 by position
    tens = (
        'twenty', 'thirty', 'forty', 'fifty',
        'sixty', 'seventy', 'eighty', 'ninety',
    )

    lookup = {}
    for value, name in enumerate(ones_and_teens, start=1):
        lookup[name] = value
    for multiple, name in enumerate(tens, start=2):
        lookup[name] = multiple * 10

    lookup.update({
        'hundred': 100,
        'half-century': 50,
        'silver jubilee': 25,
        'golden jubilee': 50,
        'diamond jubilee': 75,
        'millennial': 32,  # as of 2023, average age of millennials is around 32
        'baby boomer': 73,  # as of 2023, average age of baby boomers is around 73
        'twice century': 200,
        'decade': 10,
        'jubilee': 50,  # assumed as golden jubilee
        'millennium': 1000,  # unlikely, but included for completeness
    })

    key = word.lower()
    return lookup.get(key)

def extract_age_nlp_v3(text):
    """Estimate a user's age from a free-text description.

    The text is first screened against a few "ignore" patterns (percentages,
    ``co/...`` codes, slash-separated dates); if any of them occurs anywhere
    in the text the function gives up. Otherwise a list of age patterns is
    tried in order and the first one that yields an age between 16 and 90
    (inclusive) wins.

    Args:
        text: The description to inspect. Non-string or blank values are
            treated as "no information".

    Returns:
        The estimated age as an ``int``, or ``float('nan')`` when no
        plausible age could be extracted.
    """
    # Nothing to analyse for missing / blank descriptions
    if not isinstance(text, str) or text.strip() == "":
        return float('nan')

    current_year = datetime.now().year

    # Any occurrence of these disqualifies the whole description.
    # NOTE: words with embedded numbers (e.g. "RAMA20") are not filtered here.
    ignore_patterns = [
        r'\s*(\d+(\.\d+)?%)\s*',  # ignore percentages
        r'co/\d+\w+',  # ignore codes like "co/34cEoBAeiw"
        r'\d+/\d+(/\d+)?'  # ignore dates like "24/7", "7/24/2020"
    ]
    if any(re.search(pattern, text, re.IGNORECASE) for pattern in ignore_patterns):
        return float('nan')

    # Kept in a variable so the loop below can recognise this exact pattern;
    # comparing against a separately typed copy of the regex never matched.
    parent_pattern = r'(?:grand(?:father|mother)|mom|mother|dad|father)\s*of\s*(\d{1}|one|two|three|four|five|six)'

    # (regex, index of the capturing group that holds the number)
    patterns = [
        (r'(\d{1,3})\s*(?:years|year|yrs|yr|y|old)', 1),
        (r'(?:age|aged)\s*(\d{1,3})', 1),
        # NOTE(review): this captures a four-digit number, which can never
        # satisfy the 16-90 check below, so it currently yields no result.
        (r'(\d{4})\s*(?:s|\'s)', 1),
        (r'(\d{1,3})\s*(?:completed)', 1),
        (r'(?:(?:my|the)\s*)?(\d{1,2})(?:st|nd|rd|th)\s*decade', 1),
        (r'(?:moon landing)', 0),
        (r'(?:turning)\s*(\d{1,3})', 1),
        (r'(?:celebrating my|gonna be my)\s*(\d{1,3})', 1),
        (r'(\d{1,3})\s*(?:anniversary)', 1),
        (parent_pattern, 1),
    ]

    for pattern, group_index in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue

        captured = match.group(group_index)

        if 'moon landing' in pattern:
            # A moon-landing mention is treated as a 1969 birth year
            age = current_year - 1969
        else:
            number = int(captured) if captured.isdigit() else word_to_number(captured)
            if number is None:
                continue
            if pattern == parent_pattern:
                # "grandfather/grandmother of N" -> 45 + N, other parents -> 21 + N
                is_grandparent = match.group(0).lower().startswith('grand')
                age = (45 if is_grandparent else 21) + number
            else:
                age = number

        # Only accept plausible ages; otherwise keep trying later patterns
        if 16 <= age <= 90:
            return age

    # If no age found, return NaN
    return float('nan')

# Age-related keywords grouped by age group.
# The dict order matters: the description matcher walks the groups in this
# order and returns the first group with a matching keyword.
# Keywords are all lowercase because they are compared against lowercased text.
age_groups_keywords = {
    "Thirties": [
        "thirties", "30s", "father", "mother", "wife", "husband", "mum", "mom",
        "dad", "daddy", "mummy", "mommy", "hubby", "middle age", "career",
        "family", "parents", "parent", "professor", "married", "manager",
        "professional", "homeowner",
        # 'educator' and 'real estate' were previously fused into one string
        # by a missing comma
        "educator", "real estate", "entrepreneur", "remote work",
        "telecommute", "children", "child", "kid", "wedding", "marriage",
    ],

    "Twenties": [
        "doctoral", "graduating", "student", "bsc", "master", "msc", "masters",
        "twenties", "young adult", "millennial", "20s", "college", "youthful",
        "young professional", "investing", "crypto enthusiast", "graduate",
        "college graduate", "bachelor", "phd", "postdoc", "intern",
        "internship", "entry-level", "apprentice", "trainee", "university",
        "fraternity", "sorority", "study abroad", "exchange student",
        "backpacker", "gap year", "new mom", "new dad", "early mom",
        "early dad",
    ],

    "Adolescent": [
        "teen", "adolescent", "youth", "youngster", "high school", "junior",
        "underage", "school", "teenager", "gamer", "undergraduate", "freshman",
        "sophomore", "middle school", "varsity", "junior varsity", "teenage",
        "tiktok", "snapchat", "instagram", "game", "minecraft", "fortnite",
        "esports",
    ],
}



def updated_extract_age_group_from_description(description):
    """Guess an age group from keywords found in a description.

    Keywords are matched as case-insensitive literal substrings. They are
    checked in this order, and the first hit decides the result:

    1. ``priority_keywords["high"]``    -> "Golden Years"
    2. ``priority_keywords["Forties"]`` -> "Forties"
    3. each group of ``age_groups_keywords`` in dict order -> that group's name

    Args:
        description: Free-text description; non-string or blank values give
            "Unknown".

    Returns:
        The age-group label as a string, or "Unknown" when nothing matches.
    """
    # Check if description is not a string or is empty
    if not isinstance(description, str) or description.strip() == "":
        return "Unknown"

    lowered = description.lower()

    def mentions_any(keywords):
        # Literal substring test: keywords such as "25+ years of experience"
        # contain regex metacharacters and must not be interpreted as patterns.
        # Lowercasing the keyword lets mixed-case entries (e.g. 'PhD') match.
        return any(keyword.lower() in lowered for keyword in keywords)

    # High priority check
    if mentions_any(priority_keywords["high"]):
        return "Golden Years"

    # Medium priority check
    if mentions_any(priority_keywords["Forties"]):
        return "Forties"

    # If none of the priority keywords are found, fall back to the general groups
    for age_group, keywords in age_groups_keywords.items():
        if mentions_any(keywords):
            return age_group

    return "Unknown"

# Keywords that take precedence over the general age-group keywords.
# "high" entries map to "Golden Years" and "Forties" entries map to "Forties"
# in updated_extract_age_group_from_description.
# Three pairs of entries were previously fused by missing commas
# ("gramma" + "25+ years of experience", "business owner" + "executive",
# "writer" + "veteran"); each is now its own keyword.
priority_keywords = {
    "high": [
        "grand father of", "grand mother of", "retired", "ret",
        "old age pensioner", "nanna", "nanni", "grand-children", "grand kid",
        "grandpa", "grandma", "oldie", "grampa", "gramma",
        "25+ years of experience", "25+ years of research",
        "25+ years of playing", "retired veteren", "retiree",
        "grand - childrens", "grand child", "grand kids", "grand children",
        "grandmother", "grandfather", "charity", "retirement", "elder",
        "oldy", "old soul", "fifties", "sixties", "senior citizen", "elderly",
        "50s", "60s", "70s", "80s", "grandparent", "old age", "pensioner",
        "golden years", "silver-haired", "golf", "pension", "medicare",
    ],

    "Forties": [
        "md", "ceo", "forties", "40s", "midlife", "family", "director",
        "business owner", "executive", "senior", "cfo", "cto", "manager",
        "producer", "writer", "veteran", "specialist", "consultant",
        "advisor", "outdated", "survivor", "established", "experienced",
        "middle-aged", "mid-life", "father of 4", "father of 5", "mom of 4",
        "mom of 5",
    ],
}



def extract_age_group(age):
    """Map a numeric age onto its age-group label.

    Ages below 20 are "Adolescent", 20-29 "Twenties", 30-39 "Thirties",
    40-49 "Forties" and 50 or more "Golden Years". A value for which every
    comparison is false (such as NaN) gives "Unknown".
    """
    # (exclusive upper bound, label), checked from youngest to oldest
    brackets = (
        (20, "Adolescent"),
        (30, "Twenties"),
        (40, "Thirties"),
        (50, "Forties"),
    )
    for upper_bound, group_name in brackets:
        if age < upper_bound:
            return group_name

    # NaN fails both the loop comparisons above and this one
    return "Golden Years" if age >= 50 else "Unknown"

def process_chunk(chunk):
    """Add the age columns to one chunk of the DataFrame and return it.

    Three columns are derived from ``user_description``:

    * ``age`` - numeric age from ``extract_age_nlp_v3`` (NaN when not found)
    * ``Estimated Age Group`` - keyword-based group from
      ``updated_extract_age_group_from_description``
    * ``Categorized Age Group`` - group derived from ``age`` via
      ``extract_age_group``

    Where the keyword-based group is "Unknown" but an age was extracted, the
    age-derived group is used instead.

    Args:
        chunk: DataFrame slice containing a ``user_description`` column.

    Returns:
        A new DataFrame with the three columns added; the input is not modified.
    """
    # The chunks are slices of the caller's frame; work on an independent copy
    # so the column assignments below neither warn nor write through to it.
    chunk = chunk.copy()

    chunk['age'] = chunk['user_description'].apply(extract_age_nlp_v3)
    chunk['Estimated Age Group'] = chunk['user_description'].apply(updated_extract_age_group_from_description)
    chunk['Categorized Age Group'] = chunk['age'].apply(extract_age_group)

    # Fall back to the age-derived group when keywords gave no answer
    needs_fallback = (chunk['Estimated Age Group'] == "Unknown") & (~chunk['age'].isna())
    chunk.loc[needs_fallback, 'Estimated Age Group'] = chunk['Categorized Age Group']
    return chunk

# Split a DataFrame (or any sliceable sequence) into consecutive pieces
def chunks(df, n):
    """Return a list of consecutive slices of *df*, each at most *n* long."""
    pieces = []
    for start in range(0, len(df), n):
        stop = start + n
        pieces.append(df[start:stop])
    return pieces
# Load the data
df_age = pd.read_excel("C:/Users/hp/Desktop/Data/output_corona_gender_pro.xlsx")

# Number of threads based on the system's capabilities.
# os.cpu_count() can return None when the count is undetermined.
num_threads = min(32, (os.cpu_count() or 1) + 8)

# Chunk size based on the number of threads and size of the DataFrame.
# At least 1 so an empty DataFrame does not produce a zero step in chunks().
chunk_size = max(1, math.ceil(len(df_age) / num_threads))

# Create the chunks
df_chunks = chunks(df_age, chunk_size)

# Process chunks concurrently, using the thread count computed above
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    result_chunks = list(executor.map(process_chunk, df_chunks))

# Concatenate the results from all threads (nothing to concatenate if the input was empty)
if result_chunks:
    df_age = pd.concat(result_chunks)

# Save the data to a new Excel file
df_age.to_excel("C:/Users/hp/Desktop/Data/output_corona_gender_pro_age.xlsx", index=False)


In [None]:
### Occupation Extraction

In [None]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed



# Occupation keywords grouped by occupational class.
# classify_occupation lowercases the description and returns the first class
# (in dict order) with a keyword contained in it, so every keyword here must
# be lowercase and the order of the classes is significant.
occupational_classes = {
    "Creative & Media": [
        'dj', 'seo specialist', 'ux designer', 'actor', 'analyst', 'anchor', 'announcer', 'artist', 'author',
        'board director', 'broadcaster', 'columnist', 'copyeditor', 'copywriter', 'correspondent',
        'dancer', 'designer', 'dev', 'developer', 'digital expert', 'digital marketer', 'director', 'drummer',
        'editor', 'engineer', 'essayist', 'executive director', 'executive producer', 'film producer', 'filmmaker',
        'gamer', 'guitarist', 'illustrator', 'influencer', 'journalist', 'muralist', 'music producer', 'musician',
        'newscaster', 'novelist', 'painter', 'photo editor', 'playwright', 'podcast', 'producer', 'proofreader',
        'radio', 'radio host', 'radioman', 'rapper', 'reporter', 'screenwriter', 'sculptor', 'signals expert',
        'stage director', 'streamer', 'tech', 'video editor', 'videographer', 'web developer', 'writer', 'youtuber',
    ],
    "Business & Economics": [
        'accountant', 'advisor', 'auditor', 'banker', 'business owner', 'businessman', 'ceo', 'chairman',
        'chief executive', 'co-founder', 'consultant', 'corporate officer', 'counselor', 'economist', 'entrepreneur',
        'executive', 'financial adviser', 'financial advisor', 'financial analyst', 'financial expert', 'financial planner',
        'founder', 'innovator', 'investment banker', 'investor', 'manager', 'owner', 'proprietor', 'realestate',
        'realtor', 'shareholder', 'stakeholder', 'startup founder', 'stock trader', 'strategist', 'trader',
    ],
    "Service & Health": [
        'academician', 'activist', 'advocate', 'army officer', 'champion', 'dietician', 'doctor', 'educator', 'graduate',
        'health professional', 'instructor', 'judge', 'law enforcement officer', 'lawyer', 'lecturer', 'lobbyist',
        'marine', 'masters', 'military officer', 'naval officer', 'navy officer', 'nurse', 'pharmacist', 'police officer',
        'professor', 'promoter', 'prosecutor', 'public officer', 'researcher', 'scholar', 'student', 'teacher', 'therapist',
        'tutor', 'undergraduate',
    ],
    "Sports & Entertainment": [
        'athlete', 'baseball player', 'basketball player', 'boxer', 'coach', 'cricketer', 'cyclist', 'dancer', 'fitness trainer',
        'footballer', 'golfer', 'gymnast', 'hockey player', 'referee', 'rugby player', 'runner', 'skateboarder', 'skier',
        'snowboarder', 'soccer player', 'sports analyst', 'sports commentator', 'sports journalist', 'sports photographer',
        'surfer', 'swimmer', 'tennis player', 'wrestler', 'yoga instructor',
    ],
}


# Define the classification function
def classify_occupation(description):
    """Return the occupational class whose keywords appear in *description*.

    Matching is a case-insensitive substring test. Classes are tried in the
    order of ``occupational_classes`` and the first one with a hit is
    returned.

    Args:
        description: Free-text description. Non-string values (e.g. NaN)
            are treated as having no occupation information.

    Returns:
        The class name, or 'Unclassified' when no keyword matches.
    """
    if not isinstance(description, str):
        return 'Unclassified'

    lowered = description.lower()
    for occupation_class, keywords in occupational_classes.items():
        # Lowercase the keyword too, otherwise mixed-case entries such as
        # 'DJ' could never be found in the lowercased description.
        if any(keyword.lower() in lowered for keyword in keywords):
            return occupation_class
    return 'Unclassified'

# Classify every description in one chunk of the data
def multi_threaded_classification(data_chunk):
    """Apply ``classify_occupation`` to the ``user_description`` column.

    Missing descriptions are replaced by the empty string before
    classification. Returns the resulting Series.
    """
    descriptions = data_chunk['user_description']
    filled = descriptions.fillna("")
    return filled.apply(classify_occupation)

def main():
    """Classify the occupation of every row of the input workbook.

    Reads the Excel file, splits the rows into ``num_processors`` chunks,
    classifies each chunk in parallel with joblib, stores the result in an
    'Occupational Class' column and writes the frame to a new Excel file.
    """
    # 1. Load the data
    data = pd.read_excel("C:/Users/aj5n22/Downloads/Fanil.xlsx")

    # 2. Split data for parallel processing
    num_processors = 4  # Adjust based on your machine's cores
    # Split the row positions rather than the DataFrame itself: handing a
    # DataFrame to np.array_split goes through DataFrame.swapaxes, which
    # recent pandas versions deprecate.
    position_groups = np.array_split(np.arange(len(data)), num_processors)
    data_chunks = [data.iloc[positions] for positions in position_groups]

    # 3. Use joblib for parallel processing
    results = Parallel(n_jobs=-1)(delayed(multi_threaded_classification)(chunk) for chunk in data_chunks)

    # 4. Each result keeps its chunk's index, so the concatenation aligns with `data`
    data['Occupational Class'] = pd.concat(results)

    # 5. Save the results
    data.to_excel("C:/Users/aj5n22/Downloads/occupation.xlsx", index=False)

if __name__ == '__main__':
    main()


### Gender Extraction

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Fri Aug  4 15:52:11 2023

@author: aj5n22
"""

import os
import math
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import re

 
# Gender-indicating words and phrases, matched case-insensitively as whole words.
# "@mr" / "@mrs" sit outside the \b(...)\b group: "@" is not a word character,
# so a leading \b cannot match when the handle starts the text or follows a space.
male_word_pattern = re.compile(r"\b(he|him|his|man|boy|gentleman|father|uncle|male|forever hers|grandpa|grandfather|husband|boyfriend|mister|my missus|son|my girlfriend|dad|daddy|guy|my girl|bruh|wife's|my wife|my ex wife)\b|(?<!\w)@mr\b", re.IGNORECASE)
# "mrs" is written without the former unescaped "." (which matched any character).
female_word_pattern = re.compile(r"\b(she|her|woman|lady|girl|mrs|mother|mom|mummy|grandma|grandmother|aunt|aunty|auntie|female|forever his|wife|girlfriend|daughter|my mister|my boyfriend|my guy|husband's|my husband|my ex husband)\b|(?<!\w)@mrs\b", re.IGNORECASE)

# Phrases about the writer's partner. They contain a word of the opposite list
# ("my wife" contains "wife"), so they are recognised and removed before the
# single-word patterns are applied; otherwise both patterns always match and
# the result is "Unknown".
_male_speaker_pattern = re.compile(r"\b(my ex wife|my wife|my girlfriend|my girl|my missus|wife's|forever hers)\b", re.IGNORECASE)
_female_speaker_pattern = re.compile(r"\b(my ex husband|my husband|my boyfriend|my guy|my mister|husband's|forever his)\b", re.IGNORECASE)

# Define regex pattern for specific words to categorize as "Unknown"
# NOTE(review): with IGNORECASE the alternative "(T)he" matches the word "the",
# so every description containing "the" is classified "Unknown" - confirm intended.
words_pattern = re.compile(r"\b(man utd|man city|man united|man van|bubble man|boy meets|like an old man|(T)he|she he|he she|he's|she's|he is|she is|man is|president|heavenly father|vote)\b", re.IGNORECASE)


def extract_gender_keyword(user_description):
    """Infer the writer's gender from keywords in a description.

    Args:
        user_description: Free-text description.

    Returns:
        ``float("nan")`` for non-string or blank input; otherwise "Male" or
        "Female" when only one gender's keywords are present, and "Unknown"
        when neither or both are present or when the text contains one of
        the phrases in ``words_pattern``.
    """
    if not isinstance(user_description, str) or user_description.strip() == "":
        return float("nan")

    text = user_description.lower()

    # Check if any of the specific words is present and categorize as "Unknown"
    if words_pattern.search(text):
        return "Unknown"

    # Partner phrases count as evidence for the writer's own gender ...
    male_match = _male_speaker_pattern.search(text) is not None
    female_match = _female_speaker_pattern.search(text) is not None

    # ... and are blanked out so the words inside them ("wife", "husband", ...)
    # do not count for the opposite gender below.
    remainder = _male_speaker_pattern.sub(" ", text)
    remainder = _female_speaker_pattern.sub(" ", remainder)

    male_match = male_match or male_word_pattern.search(remainder) is not None
    female_match = female_match or female_word_pattern.search(remainder) is not None

    if male_match and not female_match:
        return "Male"
    if female_match and not male_match:
        return "Female"

    return "Unknown"



if __name__ == "__main__":
    input_file = "C:/Users/hp/Desktop/Data/gender_un.xlsx"
    output_file = "C:/Users/hp/Desktop/Data/output_corona_gender_pro.xlsx"

    df = pd.read_excel(input_file)

    # os.cpu_count() can return None when the count is undetermined
    num_threads = min(32, (os.cpu_count() or 1) + 8)

    # Process each user description concurrently using ThreadPoolExecutor.
    # The function defined in this file is extract_gender_keyword; the name
    # previously called here did not exist and raised NameError.
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        df['gender'] = list(executor.map(extract_gender_keyword, df['user_description']))

    # Save the updated DataFrame to a new Excel file
    df.to_excel(output_file, index=False)

### Classification Analysis

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
from scipy.sparse import hstack
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# 1. Load the dataset
# Reads the CSV into the module-level DataFrame `data`; the statements below
# use its 'tweets', 'Gender', 'Occupation' and 'Estimated Age Group' columns.
data = pd.read_csv("/mnt/data/Final(Sheet1).csv")

# 2. Preprocess the data
def preprocess_text(text):
    """Lowercase *text* and strip anything that starts with ``http``.

    Non-string input yields the empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Remove URLs: "http" followed by any run of non-whitespace characters
        return re.sub(r'http\S+', '', lowered)
    return ""

# Discard rows without tweet text, then normalise the remaining tweets
data = data.dropna(subset=['tweets']).assign(
    tweets=lambda frame: frame['tweets'].apply(preprocess_text)
)

# Function to train and evaluate models
def train_and_evaluate(X, y, label, tfidf_max_features=2500, test_size=0.2, random_state=42, sample_fraction=0.5):
    """Grid-search five classifiers for one target and print their scores.

    Features are the TF-IDF representation of ``X['tweets']`` plus a one-hot
    encoding of whichever of the columns 'Estimated Age Group', 'Gender' and
    'Occupation' are present in ``X`` and are not the target itself. Columns
    missing from ``X`` are skipped instead of raising ``KeyError``, and the
    target column is never used as a feature.

    For each model a ``GridSearchCV`` (5-fold) is run on a random subset of
    the training data; the resulting best estimator is then scored on the
    full training split and on the held-out split, and the results are
    printed.

    Args:
        X: DataFrame with a 'tweets' column and, optionally, the demographic
            columns named above.
        y: Target values. If it has a ``name`` attribute (e.g. a pandas
            Series), that name identifies the column to exclude from the
            features.
        label: Text used in the printed headings.
        tfidf_max_features: ``max_features`` for the TF-IDF vectorizer.
        test_size: Fraction of rows held out for validation.
        random_state: Seed for both train/test splits.
        sample_fraction: Fraction of the training split used for the grid
            search. Values outside (0, 1) use the whole training split.

    Returns:
        None. All results are printed.
    """
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    # Same for every model, so build it once
    class_mapping = {original: encoded for encoded, original in enumerate(label_encoder.classes_)}

    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=test_size, random_state=random_state)

    # Text features: fitted on the training split only
    tfidf_tweets = TfidfVectorizer(max_features=tfidf_max_features, stop_words='english', ngram_range=(1, 3))
    train_parts = [tfidf_tweets.fit_transform(X_train['tweets'])]
    test_parts = [tfidf_tweets.transform(X_test['tweets'])]

    # Categorical features: only columns that exist in X and are not the target
    target_column = getattr(y, 'name', None)
    categorical_columns = [
        column for column in ('Estimated Age Group', 'Gender', 'Occupation')
        if column in X.columns and column != target_column
    ]
    if categorical_columns:
        encoder = OneHotEncoder(drop='first')
        train_parts.append(encoder.fit_transform(X_train[categorical_columns]))
        test_parts.append(encoder.transform(X_test[categorical_columns]))

    # CSR supports the row indexing done by the second train_test_split below
    X_train_combined = hstack(train_parts).tocsr()
    X_test_combined = hstack(test_parts).tocsr()

    # Subset the training data for grid search
    if 0 < sample_fraction < 1:
        X_train_sample, _, y_train_sample, _ = train_test_split(
            X_train_combined, y_train, test_size=1 - sample_fraction, random_state=random_state
        )
    else:
        X_train_sample, y_train_sample = X_train_combined, y_train

    param_grids = {
        'MultinomialNB': {
            'alpha': [1.0, 1.5, 1.7],
            'fit_prior': [True, False]
        },
        'Logistic Regression': {
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1.0, 10]
        },
        'Random Forest': {
            'n_estimators': [100, 150, 200],
            'max_depth': [20, 30, 40]
        },
        'SVM': {
            'C': [0.01, 0.1, 1.0, 10],
            'kernel': ['rbf', 'linear']
        },
        'XGBoost': {
            'learning_rate': [0.01, 0.1, 0.5],
            'n_estimators': [50, 100, 150],
            'max_depth': [3, 5, 7]
        }
    }

    models = {
        'MultinomialNB': MultinomialNB(),
        'Logistic Regression': LogisticRegression(max_iter=3000, solver='saga', n_jobs=-1, multi_class='multinomial'),
        'Random Forest': RandomForestClassifier(n_jobs=-1),
        'SVM': SVC(),
        'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
    }

    for model_name, model in models.items():
        grid_search = GridSearchCV(model, param_grids[model_name], cv=5, n_jobs=-1)
        grid_search.fit(X_train_sample, y_train_sample)
        # The best estimator as fitted by the grid search on the sampled subset
        best_model = grid_search.best_estimator_

        y_train_pred = best_model.predict(X_train_combined)
        y_pred = best_model.predict(X_test_combined)

        train_accuracy = accuracy_score(y_train, y_train_pred)
        validation_accuracy = accuracy_score(y_test, y_pred)
        print(f"--- {model_name} ({label}) ---")
        print("Best Hyperparameters:", grid_search.best_params_)
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
        print("Training Accuracy:", train_accuracy)
        print("Validation Accuracy:", validation_accuracy)
        print('Class Mapping:', class_mapping)

# Feature frame: everything except the three demographic target columns
X = data.drop(['Gender', 'Occupation', 'Estimated Age Group'], axis=1)

# One label series per prediction target
y_gender = data['Gender']
y_occupation = data['Occupation']
y_age = data['Estimated Age Group']

# Train and evaluate for Gender, then Occupation, then Age
for target_values, target_label in (
    (y_gender, 'Gender'),
    (y_occupation, 'Occupation'),
    (y_age, 'Age'),
):
    train_and_evaluate(X, target_values, label=target_label)
