# Extracting features out of Posts, Comments, and Reactions

In [1]:
import pandas as pd
import ast
import re

In [2]:
filtered_open_data_df = pd.read_csv(r"..\Final_Company_Dataset\filtered_open_data_df2.csv")
final_combined_df = filtered_open_data_df

In [3]:
# Columns to convert from strings to lists
columns_to_convert = ["posts", "comments", "reactions"]

for column in columns_to_convert:
    final_combined_df[column] = final_combined_df[column].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )

## 1.Extracting Numerical Features and extracting Posts, Comments, and Reactions Texts

### Ectracting features out of Posts

In [4]:
# Function to convert time strings to days
def time_to_days(time_str):
    """
    Converts time strings like '1 year', '2 months', or '3 days' into equivalent days.

    Parameters:
    time_str (str): A string representing time duration.

    Returns:
    int or None: The equivalent number of days, or None if no valid format is found.
    """
    time_str = time_str.strip().lower()
    if "year" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 365
    elif "month" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 30
    elif "week" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 7
    elif "day" in time_str:
        return int(re.search(r"\d+", time_str).group())
    elif "hour" in time_str:
        return 1
    return None

# Function to extract post details
def extract_post_details(post):
    """
    Extracts engagement metrics (number of reactions, comments, and reposts) from a post.

    Parameters:
    post (str): The post content.

    Returns:
    tuple: A tuple with the number of engagements, comments, and reposts.
    """
    main_number_match = re.search(r"\n(\d+)\n", post)
    main_number = int(main_number_match.group(1)) if main_number_match else 0

    comments_match = re.search(r"(\d+)\s(comment|comments)", post, re.IGNORECASE)
    comments_number = int(comments_match.group(1)) if comments_match else 0

    reposts_match = re.search(r"(\d+)\s(repost|reposts)", post, re.IGNORECASE)
    reposts_number = int(reposts_match.group(1)) if reposts_match else 0

    return main_number, comments_number, reposts_number

# Function to clean posts and extract all relevant information
def clean_posts_and_extract_details(posts):
    """
    Cleans posts, counts reposts, extracts time and engagement metrics.

    Parameters:
    posts (list): A list of post strings.

    Returns:
    tuple: Contains cleaned posts, repost count, post times, engagements, comments, and reposts.
    """
    if not isinstance(posts, list):
        posts = []

    cleaned_posts = []
    repost_count = 0
    post_times = []
    engagements = []
    comments = []
    reposts = []

    for post in posts:
        if not isinstance(post, str):
            continue

        # Extract cleaned post content
        match = re.search(r"Follow\n(.*?)(?:\n\d+\n|\n\d+\scomment|Like|Comment|Repost|Send)", post, re.IGNORECASE | re.DOTALL)
        if match:
            content = match.group(1).strip()
            content = re.split(r"(\n…more|\nActivate to view larger image|\nDetails|\nNABIS|• 3rd+)", content, flags=re.IGNORECASE)[0]
            content = re.sub(r"…more", "", content).strip()
            cleaned_posts.append(content)

            # Extract engagement metrics
            main_number, comments_number, reposts_number = extract_post_details(post)
            engagements.append(main_number)
            comments.append(comments_number)
            reposts.append(reposts_number)

        # Extract time information
        time_match = re.search(r"(\d+\s*(?:year|month|week|day)s?)\s*ago", post, re.IGNORECASE)
        if time_match:
            time_in_days = time_to_days(time_match.group(1))
            if time_in_days:
                post_times.append(time_in_days)

    return cleaned_posts, repost_count, post_times, engagements, comments, reposts

# Apply the cleaning and extraction functions to the DataFrame
def process_dataframe(df):
    """
    Processes the DataFrame to clean posts and extract engagement metrics.

    Parameters:
    df (pd.DataFrame): The original DataFrame containing a 'posts' column.

    Returns:
    pd.DataFrame: A new DataFrame with additional columns for cleaned posts and metrics.
    """
    cleaned_data = []

    for _, row in df.iterrows():
        posts = row.get("posts", [])

        # Clean posts and extract details
        (cleaned_posts, repost_count, post_times,
         engagements, comments, reposts) = clean_posts_and_extract_details(posts)

        # Limit metrics to only cleaned posts
        valid_engagements = engagements[:len(cleaned_posts)]
        valid_comments = comments[:len(cleaned_posts)]
        valid_reposts = reposts[:len(cleaned_posts)]

        cleaned_data.append({
            "cleaned_posts": cleaned_posts,
            "num_posts": len(cleaned_posts),
            "num_reposted_posts": repost_count,
            "post_times": post_times,
            "engagements_posts": valid_engagements,
            "comments_on_posts": valid_comments,
            "reposts_number_posts": valid_reposts
        })

    cleaned_df = pd.DataFrame(cleaned_data)
    return pd.concat([df.reset_index(drop=True), cleaned_df], axis=1)

# Process the DataFrame
final_combined_df = process_dataframe(final_combined_df)

# Display the updated DataFrame
final_combined_df


Unnamed: 0.1,Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,...,equity_rounds_raised_after_5_years,Equity_raised_until_now,Success_until_now,cleaned_posts,num_posts,num_reposted_posts,post_times,engagements_posts,comments_on_posts,reposts_number_posts
0,0,Borrow,jaguzik,I build solutions for a cleaner future. Fueled...,"4,071 followers",[Feed post number 1\nJon Alain Guzik\nJon Alai...,[Feed post number 1\nJon Alain Guzik commented...,[Feed post number 1\nJon Alain Guzik likes thi...,"[{'title': 'Founder', 'company_name': 'Charge ...","[{'degree': 'MFA, Critical Studies, Critical S...",...,0,0.0,0,[Anyone else hear the buzzing around potential...,23,0,"[240, 270, 365, 365, 365, 365, 365, 365, 365, ...","[5, 6, 27, 20, 26, 10, 69, 2, 10, 5, 12, 4, 18...","[0, 2, 1, 0, 1, 0, 11, 0, 0, 0, 1, 0, 0, 1, 0,...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,WalletCard,naveennand,"I am a Senior Product Manager at Highspot, a B...","10,498 followers",[Feed post number 1\nNaveen Nand\nNaveen Nand\...,[Feed post number 1\nNaveen Nand commented on ...,[Feed post number 1\nNaveen Nand likes this\nR...,"[{'title': 'Senior Product Manager, Platform',...",[],...,1,0.0,0,"[At Highspot, we’re currently looking for an e...",16,0,"[730, 730, 730, 730, 730, 1460, 1460, 1825, 21...","[14, 6, 12, 3, 4, 6, 6, 1, 128, 4, 6, 6, 0, 3,...","[0, 0, 4, 0, 1, 0, 0, 1, 11, 0, 0, 0, 0, 0, 2, 0]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
2,3,WalletCard,britwhitmore,"Passionate, creative, energetic, and always le...","5,486 followers",[Feed post number 1\nBrittany Whitmore reposte...,[Feed post number 1\nBrittany Whitmore comment...,[Feed post number 1\nBrittany Whitmore celebra...,"[{'title': 'CEO & Professional Noisemaker', 'c...",[{'degree': 'Bachelor of Business Administrati...,...,1,0.0,0,[I'm thrilled to share that our 8th annual Fou...,15,0,"[30, 60, 60, 60, 90, 90, 120, 150, 150, 150, 2...","[37, 20, 30, 30, 41, 56, 38, 2, 3, 19, 581, 59...","[16, 1, 1, 1, 2, 14, 6, 0, 0, 1, 76, 1, 4, 0, 5]","[3, 1, 1, 1, 0, 6, 17, 0, 0, 0, 102, 2, 2, 1, 1]"
3,4,WalletCard,timothy-murphy-aa720b7,"Tim is an experienced entrepreneur, business e...",878 followers,[Feed post number 1\nTimothy Murphy\nTimothy M...,[Feed post number 1\nTimothy Murphy commented ...,[Feed post number 1\nTimothy Murphy likes this...,"[{'title': 'Founding Partner', 'company_name':...","[{'degree': 'LL.M, International Law', 'instit...",...,1,0.0,0,[This is a staggering number. Almost 1/3 of th...,17,0,"[1825, 1825, 1825, 1825, 1825, 1825, 1825, 182...","[4, 12, 2, 2, 18, 14, 6, 8, 12, 1, 1, 1, 5, 4,...","[0, 3, 1, 0, 2, 3, 1, 0, 3, 0, 0, 1, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,A3 Surfaces inc,myriam-auclair-gilbert-5a030852,,602 followers,[Feed post number 1\nMyriam Auclair-Gilbert re...,[Feed post number 1\nMyriam Auclair-Gilbert co...,[Feed post number 1\nMyriam Auclair-Gilbert ce...,[{'title': 'Surintendante des opérations du co...,"[{'degree': ""Baccalauréat en Ingénierie de l'a...",...,1,0.0,0,[🌍 Rejoignez l’aventure minière avec \nhashtag...,13,0,"[14, 120, 365, 365, 365, 365, 365, 730, 730, 7...","[33, 15, 20, 37, 19, 42, 25, 25, 64, 20, 86, 3...","[10, 2, 0, 4, 2, 1, 3, 0, 0, 0, 18, 0, 0]","[5, 11, 3, 6, 3, 7, 3, 0, 18, 4, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6480,7741,Evvnt,markunsworth,,584 followers,[Feed post number 1\nMark Unsworth\nMark Unswo...,[],[Feed post number 1\nMark Unsworth likes this\...,"[{'title': 'CPTO', 'company_name': 'Evvnt', 'c...","[{'degree': 'BSc, Information Systems', 'insti...",...,1,0.0,0,"[Jon Kell\nJon Kell, In 2017 4382 men in the U...",3,0,"[1460, 1825, 4015]","[7, 9, 1]","[1, 0, 0]","[0, 0, 0]"
6481,7742,Evvnt,russelledens,Senior Chief Technology Officer with expertise...,511 followers,[Feed post number 1\nRussell Edens\nRussell Ed...,[Feed post number 1\nRussell’s profile photo\n...,[Feed post number 1\nRussell’s profile photo\n...,"[{'title': 'CTO', 'company_name': 'Evvnt', 'co...","[{'degree': 'BS, Mathematics, Computer Science...",...,1,0.0,0,[https://lnkd.in/erm2Hcn\nClean Code\nvoomify....,2,0,"[3285, 3285, 3285, 3285, 3285, 3285]","[1, 2]","[0, 0]","[0, 0]"
6482,7743,Mymeds&Me,dianaalvarezco,My philosophy: we all can learn from anybody b...,"1,186 followers",[Feed post number 1\nDiana Álvarez Concepción\...,[Feed post number 1\nDiana Álvarez Concepción ...,[Feed post number 1\nDiana Álvarez Concepción ...,"[{'title': 'Software Engineer', 'company_name'...","[{'degree': 'Web Development Bootcamp, Full-st...",...,0,0.0,0,[I will never be bored with this brilliant spe...,16,0,"[2920, 2920, 2920, 2920, 2920, 2920, 2920, 292...","[2, 4, 3, 2, 15, 5, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6483,7744,Invisalert Solutions,jasonkadelski,,"1,790 followers",[Feed post number 1\nJason Kadelski\nJason Kad...,[Feed post number 1\nJason Kadelski commented ...,[Feed post number 1\nJason Kadelski likes this...,"[{'title': 'Co-Founder', 'company_name': 'Inte...","[{'degree': 'Master of Science - MS, Finance a...",...,1,0.0,0,[Tony Hawk: Accountants help make dreams becom...,4,0,"[2555, 2555, 2555, 2555]","[7, 4, 3, 22]","[0, 0, 0, 4]","[0, 0, 0, 4]"


In [5]:
# Step 1: Drop duplicate rows based on the 'username' column
final_combined_df = final_combined_df.drop_duplicates(subset=['username'])
# This ensures that only one row per unique 'username' remains in the DataFrame.
# Duplicates are identified based on the 'username' column, and only the first occurrence is kept.

# Step 2: Remove columns with duplicate names
final_combined_df = final_combined_df.loc[:, ~final_combined_df.columns.duplicated()]
# `final_combined_df.columns.duplicated()` identifies duplicate column names.
# `~` negates the condition to retain only unique column names.
# This step ensures that the DataFrame has no columns with the same name.

# Step 3: Count the number of NaN (missing) values in each column
nan_counts = final_combined_df.isna().sum()
# `isna()` identifies NaN values in the DataFrame.
# `sum()` aggregates the count of NaN values for each column.

# Step 4: Display the counts of NaN values
print(nan_counts)
# Outputs a summary of NaN counts for each column in the DataFrame.



Unnamed: 0                              0
Organization Name                       0
username                                0
about                                   0
followers                               0
posts                                 178
comments                              178
reactions                             178
experience                              0
education                               0
Founded Date                            0
equity_raised_in_5_years                0
Success_after_5_years                   0
equity_rounds_raised_after_5_years      0
Equity_raised_until_now                 0
Success_until_now                       0
cleaned_posts                           0
num_posts                               0
num_reposted_posts                      0
post_times                              0
engagements_posts                       0
comments_on_posts                       0
reposts_number_posts                    0
dtype: int64

In [6]:
# Reset the index of the DataFrame
final_combined_df = final_combined_df.reset_index(drop=True)
final_combined_df

Unnamed: 0.1,Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,...,equity_rounds_raised_after_5_years,Equity_raised_until_now,Success_until_now,cleaned_posts,num_posts,num_reposted_posts,post_times,engagements_posts,comments_on_posts,reposts_number_posts
0,0,Borrow,jaguzik,I build solutions for a cleaner future. Fueled...,"4,071 followers",[Feed post number 1\nJon Alain Guzik\nJon Alai...,[Feed post number 1\nJon Alain Guzik commented...,[Feed post number 1\nJon Alain Guzik likes thi...,"[{'title': 'Founder', 'company_name': 'Charge ...","[{'degree': 'MFA, Critical Studies, Critical S...",...,0,0.0,0,[Anyone else hear the buzzing around potential...,23,0,"[240, 270, 365, 365, 365, 365, 365, 365, 365, ...","[5, 6, 27, 20, 26, 10, 69, 2, 10, 5, 12, 4, 18...","[0, 2, 1, 0, 1, 0, 11, 0, 0, 0, 1, 0, 0, 1, 0,...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,WalletCard,naveennand,"I am a Senior Product Manager at Highspot, a B...","10,498 followers",[Feed post number 1\nNaveen Nand\nNaveen Nand\...,[Feed post number 1\nNaveen Nand commented on ...,[Feed post number 1\nNaveen Nand likes this\nR...,"[{'title': 'Senior Product Manager, Platform',...",[],...,1,0.0,0,"[At Highspot, we’re currently looking for an e...",16,0,"[730, 730, 730, 730, 730, 1460, 1460, 1825, 21...","[14, 6, 12, 3, 4, 6, 6, 1, 128, 4, 6, 6, 0, 3,...","[0, 0, 4, 0, 1, 0, 0, 1, 11, 0, 0, 0, 0, 0, 2, 0]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
2,3,WalletCard,britwhitmore,"Passionate, creative, energetic, and always le...","5,486 followers",[Feed post number 1\nBrittany Whitmore reposte...,[Feed post number 1\nBrittany Whitmore comment...,[Feed post number 1\nBrittany Whitmore celebra...,"[{'title': 'CEO & Professional Noisemaker', 'c...",[{'degree': 'Bachelor of Business Administrati...,...,1,0.0,0,[I'm thrilled to share that our 8th annual Fou...,15,0,"[30, 60, 60, 60, 90, 90, 120, 150, 150, 150, 2...","[37, 20, 30, 30, 41, 56, 38, 2, 3, 19, 581, 59...","[16, 1, 1, 1, 2, 14, 6, 0, 0, 1, 76, 1, 4, 0, 5]","[3, 1, 1, 1, 0, 6, 17, 0, 0, 0, 102, 2, 2, 1, 1]"
3,4,WalletCard,timothy-murphy-aa720b7,"Tim is an experienced entrepreneur, business e...",878 followers,[Feed post number 1\nTimothy Murphy\nTimothy M...,[Feed post number 1\nTimothy Murphy commented ...,[Feed post number 1\nTimothy Murphy likes this...,"[{'title': 'Founding Partner', 'company_name':...","[{'degree': 'LL.M, International Law', 'instit...",...,1,0.0,0,[This is a staggering number. Almost 1/3 of th...,17,0,"[1825, 1825, 1825, 1825, 1825, 1825, 1825, 182...","[4, 12, 2, 2, 18, 14, 6, 8, 12, 1, 1, 1, 5, 4,...","[0, 3, 1, 0, 2, 3, 1, 0, 3, 0, 0, 1, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,A3 Surfaces inc,myriam-auclair-gilbert-5a030852,,602 followers,[Feed post number 1\nMyriam Auclair-Gilbert re...,[Feed post number 1\nMyriam Auclair-Gilbert co...,[Feed post number 1\nMyriam Auclair-Gilbert ce...,[{'title': 'Surintendante des opérations du co...,"[{'degree': ""Baccalauréat en Ingénierie de l'a...",...,1,0.0,0,[🌍 Rejoignez l’aventure minière avec \nhashtag...,13,0,"[14, 120, 365, 365, 365, 365, 365, 730, 730, 7...","[33, 15, 20, 37, 19, 42, 25, 25, 64, 20, 86, 3...","[10, 2, 0, 4, 2, 1, 3, 0, 0, 0, 18, 0, 0]","[5, 11, 3, 6, 3, 7, 3, 0, 18, 4, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5968,7741,Evvnt,markunsworth,,584 followers,[Feed post number 1\nMark Unsworth\nMark Unswo...,[],[Feed post number 1\nMark Unsworth likes this\...,"[{'title': 'CPTO', 'company_name': 'Evvnt', 'c...","[{'degree': 'BSc, Information Systems', 'insti...",...,1,0.0,0,"[Jon Kell\nJon Kell, In 2017 4382 men in the U...",3,0,"[1460, 1825, 4015]","[7, 9, 1]","[1, 0, 0]","[0, 0, 0]"
5969,7742,Evvnt,russelledens,Senior Chief Technology Officer with expertise...,511 followers,[Feed post number 1\nRussell Edens\nRussell Ed...,[Feed post number 1\nRussell’s profile photo\n...,[Feed post number 1\nRussell’s profile photo\n...,"[{'title': 'CTO', 'company_name': 'Evvnt', 'co...","[{'degree': 'BS, Mathematics, Computer Science...",...,1,0.0,0,[https://lnkd.in/erm2Hcn\nClean Code\nvoomify....,2,0,"[3285, 3285, 3285, 3285, 3285, 3285]","[1, 2]","[0, 0]","[0, 0]"
5970,7743,Mymeds&Me,dianaalvarezco,My philosophy: we all can learn from anybody b...,"1,186 followers",[Feed post number 1\nDiana Álvarez Concepción\...,[Feed post number 1\nDiana Álvarez Concepción ...,[Feed post number 1\nDiana Álvarez Concepción ...,"[{'title': 'Software Engineer', 'company_name'...","[{'degree': 'Web Development Bootcamp, Full-st...",...,0,0.0,0,[I will never be bored with this brilliant spe...,16,0,"[2920, 2920, 2920, 2920, 2920, 2920, 2920, 292...","[2, 4, 3, 2, 15, 5, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5971,7744,Invisalert Solutions,jasonkadelski,,"1,790 followers",[Feed post number 1\nJason Kadelski\nJason Kad...,[Feed post number 1\nJason Kadelski commented ...,[Feed post number 1\nJason Kadelski likes this...,"[{'title': 'Co-Founder', 'company_name': 'Inte...","[{'degree': 'Master of Science - MS, Finance a...",...,1,0.0,0,[Tony Hawk: Accountants help make dreams becom...,4,0,"[2555, 2555, 2555, 2555]","[7, 4, 3, 22]","[0, 0, 0, 4]","[0, 0, 0, 4]"


### Extracting Features out of Comments

In [8]:
# Function to convert time strings to days
def time_to_days(time_str):
    """
    Converts time strings like '6mo', '2d' into equivalent days.

    Parameters:
    time_str (str): A string representing time duration.

    Returns:
    int or None: Number of days or None if no valid format is found.
    """
    if not time_str:
        return None
    time_map = {"y": 365, "mo": 30, "w": 7, "d": 1, "h": 1 / 24, "m": 1 / 1440, "s": 1 / 86400}
    match = re.match(r"(\d+)([a-z]+)", time_str.lower())
    if match:
        value, unit = match.groups()
        return int(value) * time_map.get(unit, 0)
    return None

# Function to extract comments and engagement details
def extract_comments(post, account_name):
    """
    Extracts comments, their times in days, and engagement metrics from a post.

    Parameters:
    post (str): The post content.
    account_name (str): The account name to filter comments.

    Returns:
    tuple: Contains lists for comments, times in days, engagements, comments count, and reposts count.
    """
    comments = []
    times_in_days = []
    engagements = []
    comments_count = []
    reposts_count = []

    # Define patterns for matching
    patterns = [
        rf"{re.escape(account_name)}(?:\s+\w+)?\n\s*•\s*3rd\+\n.*?\b(\d+[a-z]+)\b\n(.+?)(?:\n(?:Like|Reply|Collapse replies|Load more comments))",
        rf"{re.escape(account_name)}(?:,\s*[\w\.]+)*\s*\n\s*Author\n.*?\b(\d+[a-z]+)\b\n(.+?)(?:\n(?:Like|Reply|Collapse replies|Load more comments))",
    ]

    for pattern in patterns:
        matches = re.finditer(pattern, post, re.IGNORECASE | re.DOTALL)
        for match in matches:
            # Extract time and convert to days
            time_str = match.group(1)
            time_in_days = time_to_days(time_str)
            if time_in_days is not None:
                times_in_days.append(time_in_days)

            # Extract comment text
            comment = match.group(2).strip()
            comments.append(comment)

            # Extract engagement metrics for the cleaned post
            main_number, comment_count, repost_count = extract_post_details(post)
            engagements.append(main_number)
            comments_count.append(comment_count)
            reposts_count.append(repost_count)

    return comments, times_in_days, engagements, comments_count, reposts_count

# Function to extract account name
def extract_account_name(post):
    """
    Extracts the account name based on patterns in the post content.

    Parameters:
    post (str): The post content.

    Returns:
    str or None: Extracted account name or None if not found.
    """
    match = re.search(r"s profile photo\n(.*?) commented on this", post, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    else:
        match = re.search(r"Feed post number \d+\n(.*?)(?:\sreplied|\scommented)", post, re.IGNORECASE)
        if match:
            return match.group(1).strip()
    return None

# Function to process a list of comments
def process_comments_list(comments_list):
    """
    Processes a list of comments, extracting cleaned comments and engagement metrics.

    Parameters:
    comments_list (list): A list of posts containing comments.

    Returns:
    tuple: Contains lists of cleaned comments, times, engagements, comments count, reposts count, and account name.
    """
    cleaned_comments = []
    days_ago_list = []
    engagements = []
    comments_counts = []
    reposts_counts = []
    account_name = None  # Default to None if no account name is found

    for post in comments_list:
        if not post:
            continue

        # Extract account name
        if account_name is None:
            account_name = extract_account_name(post)

        # Extract comments and details if account name is found
        if account_name:
            (comments, times_in_days, post_engagements,
             post_comments, post_reposts) = extract_comments(post, account_name)
            cleaned_comments.extend(comments)
            days_ago_list.extend(times_in_days)
            engagements.extend(post_engagements)
            comments_counts.extend(post_comments)
            reposts_counts.extend(post_reposts)

    return cleaned_comments, days_ago_list, engagements, comments_counts, reposts_counts, account_name

# Function to process a single row in the DataFrame
def process_row(row):
    """
    Processes a DataFrame row to extract and clean comments and metrics.

    Parameters:
    row (pd.Series): A row from the DataFrame.

    Returns:
    pd.Series: A Series with additional columns for cleaned comments and metrics.
    """
    try:
        comments_list = row.get("comments", [])
        if isinstance(comments_list, list):
            (cleaned_comments, days_ago, engagements,
             comments_counts, reposts_counts, account_name) = process_comments_list(comments_list)
            return pd.Series({
                "cleaned_comments": cleaned_comments,
                "num_comments": len(cleaned_comments),
                "comment_days_ago": days_ago,
                "engagements_comments": engagements,
                "comments_on_comments": comments_counts,
                "reposts_number_comments": reposts_counts,
                "account_name": account_name
            })
        return pd.Series({
            "cleaned_comments": [],
            "num_comments": 0,
            "comment_days_ago": [],
            "engagements_comments": [],
            "comments_on_comments": [],
            "reposts_number_comments": [],
            "account_name": None
        })
    except Exception as e:
        return pd.Series({
            "cleaned_comments": [],
            "num_comments": 0,
            "comment_days_ago": [],
            "engagements_comments": [],
            "comments_on_comments": [],
            "reposts_number_comments": [],
            "account_name": None
        })

# Apply the processing function to the DataFrame
processed_columns = final_combined_df.apply(process_row, axis=1)
final_combined_df = pd.concat([final_combined_df, processed_columns], axis=1)

# Display the updated DataFrame
final_combined_df


Unnamed: 0.1,Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,...,engagements_posts,comments_on_posts,reposts_number_posts,cleaned_comments,num_comments,comment_days_ago,engagements_comments,comments_on_comments,reposts_number_comments,account_name
0,0,Borrow,jaguzik,I build solutions for a cleaner future. Fueled...,"4,071 followers",[Feed post number 1\nJon Alain Guzik\nJon Alai...,[Feed post number 1\nJon Alain Guzik commented...,[Feed post number 1\nJon Alain Guzik likes thi...,"[{'title': 'Founder', 'company_name': 'Charge ...","[{'degree': 'MFA, Critical Studies, Critical S...",...,"[5, 6, 27, 20, 26, 10, 69, 2, 10, 5, 12, 4, 18...","[0, 2, 1, 0, 1, 0, 11, 0, 0, 0, 1, 0, 0, 1, 0,...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",[Brittni Karol This is right up Charge Collect...,10,"[120, 120, 180, 180, 210, 240, 240, 240, 330, ...","[18, 294, 128, 362, 37, 190, 6, 6, 122, 16]","[2, 45, 34, 235, 16, 39, 2, 2, 99, 154]","[3, 1, 0, 0, 0, 7, 0, 0, 5, 4]",Jon Alain Guzik
1,2,WalletCard,naveennand,"I am a Senior Product Manager at Highspot, a B...","10,498 followers",[Feed post number 1\nNaveen Nand\nNaveen Nand\...,[Feed post number 1\nNaveen Nand commented on ...,[Feed post number 1\nNaveen Nand likes this\nR...,"[{'title': 'Senior Product Manager, Platform',...",[],...,"[14, 6, 12, 3, 4, 6, 6, 1, 128, 4, 6, 6, 0, 3,...","[0, 0, 4, 0, 1, 0, 0, 1, 11, 0, 0, 0, 0, 0, 2, 0]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[Congrats Natalie! So excited for you!!!, Cong...",11,"[90, 365, 730, 730, 1460, 1825, 1825, 1825, 18...","[309, 251, 556, 556, 47, 27, 183, 70, 128, 128...","[132, 64, 170, 170, 18, 2, 37, 185, 11, 11, 78]","[0, 0, 4, 4, 0, 3, 0, 0, 0, 0, 136]",Naveen Nand
2,3,WalletCard,britwhitmore,"Passionate, creative, energetic, and always le...","5,486 followers",[Feed post number 1\nBrittany Whitmore reposte...,[Feed post number 1\nBrittany Whitmore comment...,[Feed post number 1\nBrittany Whitmore celebra...,"[{'title': 'CEO & Professional Noisemaker', 'c...",[{'degree': 'Bachelor of Business Administrati...,...,"[37, 20, 30, 30, 41, 56, 38, 2, 3, 19, 581, 59...","[16, 1, 1, 1, 2, 14, 6, 0, 0, 1, 76, 1, 4, 0, 5]","[3, 1, 1, 1, 0, 6, 17, 0, 0, 0, 102, 2, 2, 1, 1]","[YES! Always show instead of tell! 🙌🏻, Sean Co...",10,"[60, 60, 60, 60, 60, 90, 90, 120, 120, 150]","[44, 18, 55, 25, 41, 36, 80, 381, 131, 148]","[10, 15, 7, 16, 1, 12, 2, 110, 33, 17]","[1, 0, 0, 1, 1, 0, 0, 5, 12, 10]",Brittany Whitmore
3,4,WalletCard,timothy-murphy-aa720b7,"Tim is an experienced entrepreneur, business e...",878 followers,[Feed post number 1\nTimothy Murphy\nTimothy M...,[Feed post number 1\nTimothy Murphy commented ...,[Feed post number 1\nTimothy Murphy likes this...,"[{'title': 'Founding Partner', 'company_name':...","[{'degree': 'LL.M, International Law', 'instit...",...,"[4, 12, 2, 2, 18, 14, 6, 8, 12, 1, 1, 1, 5, 4,...","[0, 3, 1, 0, 2, 3, 1, 0, 3, 0, 0, 1, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Congrats Victor!, Vinod Varma looking good bu...",9,"[7, 210, 240, 365, 730, 730, 730, 730, 730]","[270, 41, 307, 51, 794, 31, 0, 102, 128]","[82, 4, 123, 3, 300, 3, 60, 8, 19]","[4, 2, 0, 1, 10, 0, 9, 1, 0]",Timothy Murphy
4,5,A3 Surfaces inc,myriam-auclair-gilbert-5a030852,,602 followers,[Feed post number 1\nMyriam Auclair-Gilbert re...,[Feed post number 1\nMyriam Auclair-Gilbert co...,[Feed post number 1\nMyriam Auclair-Gilbert ce...,[{'title': 'Surintendante des opérations du co...,"[{'degree': ""Baccalauréat en Ingénierie de l'a...",...,"[33, 15, 20, 37, 19, 42, 25, 25, 64, 20, 86, 3...","[10, 2, 0, 4, 2, 1, 3, 0, 0, 0, 18, 0, 0]","[5, 11, 3, 6, 3, 7, 3, 0, 18, 4, 0, 0, 0]","[Félicitations Francis, Bravo!!! 🤩, Bravo! C’e...",10,"[60, 365, 365, 365, 365, 365, 365, 365, 365, 365]","[20, 64, 98, 20, 37, 37, 37, 37, 111, 19]","[11, 22, 52, 2, 4, 4, 4, 4, 14, 2]","[0, 0, 0, 1, 6, 6, 6, 6, 1, 3]",Myriam Auclair-Gilbert
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5968,7741,Evvnt,markunsworth,,584 followers,[Feed post number 1\nMark Unsworth\nMark Unswo...,[],[Feed post number 1\nMark Unsworth likes this\...,"[{'title': 'CPTO', 'company_name': 'Evvnt', 'c...","[{'degree': 'BSc, Information Systems', 'insti...",...,"[7, 9, 1]","[1, 0, 0]","[0, 0, 0]",[],0,[],[],[],[],
5969,7742,Evvnt,russelledens,Senior Chief Technology Officer with expertise...,511 followers,[Feed post number 1\nRussell Edens\nRussell Ed...,[Feed post number 1\nRussell’s profile photo\n...,[Feed post number 1\nRussell’s profile photo\n...,"[{'title': 'CTO', 'company_name': 'Evvnt', 'co...","[{'degree': 'BS, Mathematics, Computer Science...",...,"[1, 2]","[0, 0]","[0, 0]","[Voomify, Voomify, Yes, yes I am. But not with...",3,"[2920, 2920, 2920]","[7, 7, 4]","[1, 1, 1]","[0, 0, 0]",Russell Edens
5970,7743,Mymeds&Me,dianaalvarezco,My philosophy: we all can learn from anybody b...,"1,186 followers",[Feed post number 1\nDiana Álvarez Concepción\...,[Feed post number 1\nDiana Álvarez Concepción ...,[Feed post number 1\nDiana Álvarez Concepción ...,"[{'title': 'Software Engineer', 'company_name'...","[{'degree': 'Web Development Bootcamp, Full-st...",...,"[2, 4, 3, 2, 15, 5, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[Congratulations, Suzanne 👏👏👏, Congratulations...",9,"[90, 1095, 2190, 2190, 2190, 2190, 2190, 2190,...","[44, 405, 44, 78, 30, 30, 3, 0, 8]","[15, 35, 36, 23, 4, 4, 1, 281, 17]","[0, 21, 0, 1, 1, 1, 0, 112, 0]",Diana Álvarez Concepción
5971,7744,Invisalert Solutions,jasonkadelski,,"1,790 followers",[Feed post number 1\nJason Kadelski\nJason Kad...,[Feed post number 1\nJason Kadelski commented ...,[Feed post number 1\nJason Kadelski likes this...,"[{'title': 'Co-Founder', 'company_name': 'Inte...","[{'degree': 'Master of Science - MS, Finance a...",...,"[7, 4, 3, 22]","[0, 0, 0, 4]","[0, 0, 0, 4]","[Congrats!, Congrats Nick!, Nice work James!, ...",6,"[1095, 1460, 2190, 2190, 2190, 2555]","[238, 41, 5, 22, 22, 306]","[41, 24, 1, 4, 4, 47]","[0, 0, 0, 4, 4, 1]",Jason Kadelski


### Extract Features from Reactions

In [9]:
# Function to convert time strings to days
def time_to_days(time_str):
    """
    Converts time strings like '1 year', '2 months', or '3 days' into equivalent days.

    Parameters:
    time_str (str): A string representing time duration.

    Returns:
    int or None: The equivalent number of days, or None if no valid format is found.
    """
    time_str = time_str.strip().lower()
    if "year" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 365
    elif "month" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 30
    elif "week" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 7
    elif "day" in time_str:
        return int(re.search(r"\d+", time_str).group())
    elif "hour" in time_str or "minute" in time_str or "second" in time_str:
        return 1  # Consider less than a day as 1 day
    return None

# Function to extract the reaction type
def extract_reaction(post, account_name):
    """
    Extracts the type of reaction (e.g., "likes this", "finds this insightful") for a given account name.

    Parameters:
    post (str): The post content.
    account_name (str): The account name whose reaction is being extracted.

    Returns:
    str or None: The reaction type or None if no reaction is found.
    """
    if not isinstance(post, str) or not isinstance(account_name, str):
        return None
    match = re.search(rf"{re.escape(account_name)}\s+(likes this|finds this insightful|supports this|comments on this)", post, re.IGNORECASE)
    return match.group(1).strip() if match else None

# Function to extract the post text
def extract_post_text(post):
    """
    Extracts the post text from the content between '\nFollow\n' and the first occurrence of '\n…more' or '\nhashtag'.

    Parameters:
    post (str): The post content.

    Returns:
    str or None: Extracted post text or None if no match is found.
    """
    if not isinstance(post, str):
        return None
    match = re.search(r"\nFollow\n(.*?)(?:\n…more|\nhashtag)", post, re.DOTALL)
    return match.group(1).strip() if match else None

# Function to extract the time in days
def extract_time_in_days(post):
    """
    Extracts the time (e.g., "2 days ago") from the post and converts it into days.

    Parameters:
    post (str): The post content.

    Returns:
    int or None: The equivalent time in days or None if no valid format is found.
    """
    if not isinstance(post, str):
        return None
    match = re.search(r"•?\s*\n\s*(\d+\s*[a-z]+)\s+ago\s+Follow\n", post, re.IGNORECASE)
    return time_to_days(match.group(1).strip()) if match else None

# Function to extract hashtags
def extract_hashtags(post):
    """
    Extracts hashtags from the post content.

    Parameters:
    post (str): The post content.

    Returns:
    list: A list of extracted hashtags without the '#' prefix.
    """
    if not isinstance(post, str):
        return []
    match = re.search(r"\nhashtag\n(#[^\n]+)", post, re.DOTALL)
    if match:
        hashtags = match.group(1).split("\n")
        return [tag.lstrip("#") for tag in hashtags if tag.startswith("#")]
    return []

# Process each row for reactions and related details
def process_row(row):
    """
    Processes a single DataFrame row to extract and clean reactions, hashtags, and times.

    Parameters:
    row (pd.Series): A row from the DataFrame.

    Returns:
    pd.Series: A Series with new columns for cleaned reactions, reaction types, hashtags, etc.
    """
    reactions_list = row.get("reactions", [])
    account_name = row.get("account_name", "")

    if not isinstance(reactions_list, list):
        reactions_list = []

    # Initialize outputs
    cleaned_reactions = []
    reaction_types = []
    hashtag_lists = []
    post_times = []

    # Process each reaction in the list
    for reaction_post in reactions_list:
        if not isinstance(reaction_post, str):
            continue

        post_text = extract_post_text(reaction_post)
        if post_text:  # Process only if post text is extracted
            cleaned_reactions.append(post_text)
            reaction_types.append(extract_reaction(reaction_post, account_name))
            post_times.append(extract_time_in_days(reaction_post))
            hashtag_lists.append(extract_hashtags(reaction_post))

    return pd.Series({
        "cleaned_reactions": cleaned_reactions,
        "reaction_types": reaction_types,
        "hashtag_lists": hashtag_lists,
        "new_num_posts": len(cleaned_reactions),
        "new_post_times": [time for time in post_times if time is not None]
    })

# Apply the processing functions to the DataFrame
temp_result = final_combined_df.apply(process_row, axis=1)
final_combined_df = pd.concat([final_combined_df, temp_result], axis=1)

# Display the updated DataFrame
final_combined_df


Unnamed: 0.1,Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,...,comment_days_ago,engagements_comments,comments_on_comments,reposts_number_comments,account_name,cleaned_reactions,reaction_types,hashtag_lists,new_num_posts,new_post_times
0,0,Borrow,jaguzik,I build solutions for a cleaner future. Fueled...,"4,071 followers",[Feed post number 1\nJon Alain Guzik\nJon Alai...,[Feed post number 1\nJon Alain Guzik commented...,[Feed post number 1\nJon Alain Guzik likes thi...,"[{'title': 'Founder', 'company_name': 'Charge ...","[{'degree': 'MFA, Critical Studies, Critical S...",...,"[120, 120, 180, 180, 210, 240, 240, 240, 330, ...","[18, 294, 128, 362, 37, 190, 6, 6, 122, 16]","[2, 45, 34, 235, 16, 39, 2, 2, 99, 154]","[3, 1, 0, 0, 0, 7, 0, 0, 5, 4]",Jon Alain Guzik,[Honored to be invited to speak with Edward Ad...,"[likes this, likes this, likes this, likes thi...",[[IstioDay2024. Looking forward to continuing ...,7,"[14, 30, 30, 30, 30, 30, 30]"
1,2,WalletCard,naveennand,"I am a Senior Product Manager at Highspot, a B...","10,498 followers",[Feed post number 1\nNaveen Nand\nNaveen Nand\...,[Feed post number 1\nNaveen Nand commented on ...,[Feed post number 1\nNaveen Nand likes this\nR...,"[{'title': 'Senior Product Manager, Platform',...",[],...,"[90, 365, 730, 730, 1460, 1825, 1825, 1825, 18...","[309, 251, 556, 556, 47, 27, 183, 70, 128, 128...","[132, 64, 170, 170, 18, 2, 37, 185, 11, 11, 78]","[0, 0, 4, 4, 0, 3, 0, 0, 0, 0, 136]",Naveen Nand,[I’m happy to share that this week marked my t...,"[likes this, likes this, likes this, likes thi...","[[], [], [], [], [], [], [], [Spark24, where I...",10,"[7, 14, 14, 14, 14, 30, 30, 30, 30, 30]"
2,3,WalletCard,britwhitmore,"Passionate, creative, energetic, and always le...","5,486 followers",[Feed post number 1\nBrittany Whitmore reposte...,[Feed post number 1\nBrittany Whitmore comment...,[Feed post number 1\nBrittany Whitmore celebra...,"[{'title': 'CEO & Professional Noisemaker', 'c...",[{'degree': 'Bachelor of Business Administrati...,...,"[60, 60, 60, 60, 60, 90, 90, 120, 120, 150]","[44, 18, 55, 25, 41, 36, 80, 381, 131, 148]","[10, 15, 7, 16, 1, 12, 2, 110, 33, 17]","[1, 0, 0, 1, 1, 0, 0, 5, 12, 10]",Brittany Whitmore,[What’s the coolest thing about being quote tw...,"[None, None, None, None, None, None, None, None]","[[], [], [], [], [SaaS and ], [], [], []]",8,"[2, 4, 7, 7, 14, 14, 14, 14]"
3,4,WalletCard,timothy-murphy-aa720b7,"Tim is an experienced entrepreneur, business e...",878 followers,[Feed post number 1\nTimothy Murphy\nTimothy M...,[Feed post number 1\nTimothy Murphy commented ...,[Feed post number 1\nTimothy Murphy likes this...,"[{'title': 'Founding Partner', 'company_name':...","[{'degree': 'LL.M, International Law', 'instit...",...,"[7, 210, 240, 365, 730, 730, 730, 730, 730]","[270, 41, 307, 51, 794, 31, 0, 102, 128]","[82, 4, 123, 3, 300, 3, 60, 8, 19]","[4, 2, 0, 1, 10, 0, 9, 1, 0]",Timothy Murphy,"[Nick Campbell, CFA of Adcore (TSX: ADCO) sat ...","[likes this, likes this, likes this, likes thi...","[[], [], [], [], [], [], [], [challengerbrands...",10,"[30, 30, 30, 30, 30, 30, 30, 30, 30, 60]"
4,5,A3 Surfaces inc,myriam-auclair-gilbert-5a030852,,602 followers,[Feed post number 1\nMyriam Auclair-Gilbert re...,[Feed post number 1\nMyriam Auclair-Gilbert co...,[Feed post number 1\nMyriam Auclair-Gilbert ce...,[{'title': 'Surintendante des opérations du co...,"[{'degree': ""Baccalauréat en Ingénierie de l'a...",...,"[60, 365, 365, 365, 365, 365, 365, 365, 365, 365]","[20, 64, 98, 20, 37, 37, 37, 37, 111, 19]","[11, 22, 52, 2, 4, 4, 4, 4, 14, 2]","[0, 0, 0, 1, 6, 6, 6, 6, 1, 3]",Myriam Auclair-Gilbert,[🎉 Célébration des 20 ans du Développement éco...,"[None, None, None, None, likes this, likes thi...","[[], [MomentImportant], [InaugurationDeLaMineM...",10,"[30, 30, 60, 60, 90, 120, 150, 150, 150, 150]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5968,7741,Evvnt,markunsworth,,584 followers,[Feed post number 1\nMark Unsworth\nMark Unswo...,[],[Feed post number 1\nMark Unsworth likes this\...,"[{'title': 'CPTO', 'company_name': 'Evvnt', 'c...","[{'degree': 'BSc, Information Systems', 'insti...",...,[],[],[],[],,[Calling all American friends - Yoto is now li...,"[None, None, None, None, None, None, None, Non...","[[], [], [], [], [], [], [], [], [], [], [], []]",12,"[30, 90, 150, 150, 180, 180, 270, 365, 365, 36..."
5969,7742,Evvnt,russelledens,Senior Chief Technology Officer with expertise...,511 followers,[Feed post number 1\nRussell Edens\nRussell Ed...,[Feed post number 1\nRussell’s profile photo\n...,[Feed post number 1\nRussell’s profile photo\n...,"[{'title': 'CTO', 'company_name': 'Evvnt', 'co...","[{'degree': 'BS, Mathematics, Computer Science...",...,"[2920, 2920, 2920]","[7, 7, 4]","[1, 1, 1]","[0, 0, 0]",Russell Edens,[Business leaders who adopt the attitude that ...,"[likes this, likes this, likes this, likes this]","[[], [], [], []]",4,"[3285, 3285, 3285, 3285]"
5970,7743,Mymeds&Me,dianaalvarezco,My philosophy: we all can learn from anybody b...,"1,186 followers",[Feed post number 1\nDiana Álvarez Concepción\...,[Feed post number 1\nDiana Álvarez Concepción ...,[Feed post number 1\nDiana Álvarez Concepción ...,"[{'title': 'Software Engineer', 'company_name'...","[{'degree': 'Web Development Bootcamp, Full-st...",...,"[90, 1095, 2190, 2190, 2190, 2190, 2190, 2190,...","[44, 405, 44, 78, 30, 30, 3, 0, 8]","[15, 35, 36, 23, 4, 4, 1, 281, 17]","[0, 21, 0, 1, 1, 1, 0, 112, 0]",Diana Álvarez Concepción,[TRABAJO REMOTO ???\n\nToda la vida te voy a e...,"[None, likes this, likes this, likes this, Non...","[[], [], [], [], [], [], []]",7,"[90, 150, 365, 730, 730, 730, 1095]"
5971,7744,Invisalert Solutions,jasonkadelski,,"1,790 followers",[Feed post number 1\nJason Kadelski\nJason Kad...,[Feed post number 1\nJason Kadelski commented ...,[Feed post number 1\nJason Kadelski likes this...,"[{'title': 'Co-Founder', 'company_name': 'Inte...","[{'degree': 'Master of Science - MS, Finance a...",...,"[1095, 1460, 2190, 2190, 2190, 2555]","[238, 41, 5, 22, 22, 306]","[41, 24, 1, 4, 4, 47]","[0, 0, 0, 4, 4, 1]",Jason Kadelski,[Children's Mercy Kansas City + Camber Mental ...,"[likes this, likes this, likes this, likes thi...","[[], [], [], [], [], [], [], [philadelphiabusi...",10,"[6, 21, 21, 14, 21, 60, 30, 60, 90, 90]"


In [10]:
#final_combined_df.at[2, "Linkedin_url"]

In [11]:
final_combined_df.columns

Index(['Unnamed: 0', 'Organization Name', 'username', 'about', 'followers',
       'posts', 'comments', 'reactions', 'experience', 'education',
       'Founded Date', 'equity_raised_in_5_years', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Equity_raised_until_now',
       'Success_until_now', 'cleaned_posts', 'num_posts', 'num_reposted_posts',
       'post_times', 'engagements_posts', 'comments_on_posts',
       'reposts_number_posts', 'cleaned_comments', 'num_comments',
       'comment_days_ago', 'engagements_comments', 'comments_on_comments',
       'reposts_number_comments', 'account_name', 'cleaned_reactions',
       'reaction_types', 'hashtag_lists', 'new_num_posts', 'new_post_times'],
      dtype='object')

In [12]:
final_combined_df[['Organization Name']].nunique()

Organization Name    1476
dtype: int64

### Extract Number of Followers

In [13]:
# Step 1: Clean the 'followers' column
final_combined_df['followers'] = final_combined_df['followers'] \
    .str.replace('followers', '', regex=False)  # Remove the text "followers"
    .str.replace(',', '', regex=False)          # Remove commas to facilitate numeric conversion
    .astype(int)                                # Convert the cleaned strings to integers

# Step 2: Verify the conversion by displaying the updated DataFrame
print(final_combined_df)  # Display the DataFrame to ensure the 'followers' column is properly converted



Unnamed: 0.1,Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,...,comment_days_ago,engagements_comments,comments_on_comments,reposts_number_comments,account_name,cleaned_reactions,reaction_types,hashtag_lists,new_num_posts,new_post_times
0,0,Borrow,jaguzik,I build solutions for a cleaner future. Fueled...,4071,[Feed post number 1\nJon Alain Guzik\nJon Alai...,[Feed post number 1\nJon Alain Guzik commented...,[Feed post number 1\nJon Alain Guzik likes thi...,"[{'title': 'Founder', 'company_name': 'Charge ...","[{'degree': 'MFA, Critical Studies, Critical S...",...,"[120, 120, 180, 180, 210, 240, 240, 240, 330, ...","[18, 294, 128, 362, 37, 190, 6, 6, 122, 16]","[2, 45, 34, 235, 16, 39, 2, 2, 99, 154]","[3, 1, 0, 0, 0, 7, 0, 0, 5, 4]",Jon Alain Guzik,[Honored to be invited to speak with Edward Ad...,"[likes this, likes this, likes this, likes thi...",[[IstioDay2024. Looking forward to continuing ...,7,"[14, 30, 30, 30, 30, 30, 30]"
1,2,WalletCard,naveennand,"I am a Senior Product Manager at Highspot, a B...",10498,[Feed post number 1\nNaveen Nand\nNaveen Nand\...,[Feed post number 1\nNaveen Nand commented on ...,[Feed post number 1\nNaveen Nand likes this\nR...,"[{'title': 'Senior Product Manager, Platform',...",[],...,"[90, 365, 730, 730, 1460, 1825, 1825, 1825, 18...","[309, 251, 556, 556, 47, 27, 183, 70, 128, 128...","[132, 64, 170, 170, 18, 2, 37, 185, 11, 11, 78]","[0, 0, 4, 4, 0, 3, 0, 0, 0, 0, 136]",Naveen Nand,[I’m happy to share that this week marked my t...,"[likes this, likes this, likes this, likes thi...","[[], [], [], [], [], [], [], [Spark24, where I...",10,"[7, 14, 14, 14, 14, 30, 30, 30, 30, 30]"
2,3,WalletCard,britwhitmore,"Passionate, creative, energetic, and always le...",5486,[Feed post number 1\nBrittany Whitmore reposte...,[Feed post number 1\nBrittany Whitmore comment...,[Feed post number 1\nBrittany Whitmore celebra...,"[{'title': 'CEO & Professional Noisemaker', 'c...",[{'degree': 'Bachelor of Business Administrati...,...,"[60, 60, 60, 60, 60, 90, 90, 120, 120, 150]","[44, 18, 55, 25, 41, 36, 80, 381, 131, 148]","[10, 15, 7, 16, 1, 12, 2, 110, 33, 17]","[1, 0, 0, 1, 1, 0, 0, 5, 12, 10]",Brittany Whitmore,[What’s the coolest thing about being quote tw...,"[None, None, None, None, None, None, None, None]","[[], [], [], [], [SaaS and ], [], [], []]",8,"[2, 4, 7, 7, 14, 14, 14, 14]"
3,4,WalletCard,timothy-murphy-aa720b7,"Tim is an experienced entrepreneur, business e...",878,[Feed post number 1\nTimothy Murphy\nTimothy M...,[Feed post number 1\nTimothy Murphy commented ...,[Feed post number 1\nTimothy Murphy likes this...,"[{'title': 'Founding Partner', 'company_name':...","[{'degree': 'LL.M, International Law', 'instit...",...,"[7, 210, 240, 365, 730, 730, 730, 730, 730]","[270, 41, 307, 51, 794, 31, 0, 102, 128]","[82, 4, 123, 3, 300, 3, 60, 8, 19]","[4, 2, 0, 1, 10, 0, 9, 1, 0]",Timothy Murphy,"[Nick Campbell, CFA of Adcore (TSX: ADCO) sat ...","[likes this, likes this, likes this, likes thi...","[[], [], [], [], [], [], [], [challengerbrands...",10,"[30, 30, 30, 30, 30, 30, 30, 30, 30, 60]"
4,5,A3 Surfaces inc,myriam-auclair-gilbert-5a030852,,602,[Feed post number 1\nMyriam Auclair-Gilbert re...,[Feed post number 1\nMyriam Auclair-Gilbert co...,[Feed post number 1\nMyriam Auclair-Gilbert ce...,[{'title': 'Surintendante des opérations du co...,"[{'degree': ""Baccalauréat en Ingénierie de l'a...",...,"[60, 365, 365, 365, 365, 365, 365, 365, 365, 365]","[20, 64, 98, 20, 37, 37, 37, 37, 111, 19]","[11, 22, 52, 2, 4, 4, 4, 4, 14, 2]","[0, 0, 0, 1, 6, 6, 6, 6, 1, 3]",Myriam Auclair-Gilbert,[🎉 Célébration des 20 ans du Développement éco...,"[None, None, None, None, likes this, likes thi...","[[], [MomentImportant], [InaugurationDeLaMineM...",10,"[30, 30, 60, 60, 90, 120, 150, 150, 150, 150]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5968,7741,Evvnt,markunsworth,,584,[Feed post number 1\nMark Unsworth\nMark Unswo...,[],[Feed post number 1\nMark Unsworth likes this\...,"[{'title': 'CPTO', 'company_name': 'Evvnt', 'c...","[{'degree': 'BSc, Information Systems', 'insti...",...,[],[],[],[],,[Calling all American friends - Yoto is now li...,"[None, None, None, None, None, None, None, Non...","[[], [], [], [], [], [], [], [], [], [], [], []]",12,"[30, 90, 150, 150, 180, 180, 270, 365, 365, 36..."
5969,7742,Evvnt,russelledens,Senior Chief Technology Officer with expertise...,511,[Feed post number 1\nRussell Edens\nRussell Ed...,[Feed post number 1\nRussell’s profile photo\n...,[Feed post number 1\nRussell’s profile photo\n...,"[{'title': 'CTO', 'company_name': 'Evvnt', 'co...","[{'degree': 'BS, Mathematics, Computer Science...",...,"[2920, 2920, 2920]","[7, 7, 4]","[1, 1, 1]","[0, 0, 0]",Russell Edens,[Business leaders who adopt the attitude that ...,"[likes this, likes this, likes this, likes this]","[[], [], [], []]",4,"[3285, 3285, 3285, 3285]"
5970,7743,Mymeds&Me,dianaalvarezco,My philosophy: we all can learn from anybody b...,1186,[Feed post number 1\nDiana Álvarez Concepción\...,[Feed post number 1\nDiana Álvarez Concepción ...,[Feed post number 1\nDiana Álvarez Concepción ...,"[{'title': 'Software Engineer', 'company_name'...","[{'degree': 'Web Development Bootcamp, Full-st...",...,"[90, 1095, 2190, 2190, 2190, 2190, 2190, 2190,...","[44, 405, 44, 78, 30, 30, 3, 0, 8]","[15, 35, 36, 23, 4, 4, 1, 281, 17]","[0, 21, 0, 1, 1, 1, 0, 112, 0]",Diana Álvarez Concepción,[TRABAJO REMOTO ???\n\nToda la vida te voy a e...,"[None, likes this, likes this, likes this, Non...","[[], [], [], [], [], [], []]",7,"[90, 150, 365, 730, 730, 730, 1095]"
5971,7744,Invisalert Solutions,jasonkadelski,,1790,[Feed post number 1\nJason Kadelski\nJason Kad...,[Feed post number 1\nJason Kadelski commented ...,[Feed post number 1\nJason Kadelski likes this...,"[{'title': 'Co-Founder', 'company_name': 'Inte...","[{'degree': 'Master of Science - MS, Finance a...",...,"[1095, 1460, 2190, 2190, 2190, 2555]","[238, 41, 5, 22, 22, 306]","[41, 24, 1, 4, 4, 47]","[0, 0, 0, 4, 4, 1]",Jason Kadelski,[Children's Mercy Kansas City + Camber Mental ...,"[likes this, likes this, likes this, likes thi...","[[], [], [], [], [], [], [], [philadelphiabusi...",10,"[6, 21, 21, 14, 21, 60, 30, 60, 90, 90]"


In [15]:
final_combined_df_first_half = pd.read_csv(r"..\Final_Company_Dataset\final_combined_df_first_half3.csv")

In [16]:
# Step 1: Filter rows where 'Success_after_5_years' is equal to -1
filtered_data = final_combined_df_first_half[
    final_combined_df_first_half['Success_after_5_years'] == -1
].copy()  # Create a copy to avoid modifying the original DataFrame

# Step 2: Ensure 'cleaned_posts' is treated as a list and check if it's not empty
filtered_data = filtered_data[
    filtered_data['cleaned_posts'].apply(lambda x: isinstance(x, list) and len(x) > 0)
]
# - `apply` is used with a lambda function to check:
#   1. If the value in 'cleaned_posts' is a list (`isinstance(x, list)`).
#   2. If the list is not empty (`len(x) > 0`).

# Step 3: Display the filtered data
print(filtered_data)  # Display the filtered DataFrame to verify results


Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,Founded Date,...,cleaned_comments,num_comments,comment_days_ago,engagements_comments,comments_on_comments,reposts_number_comments,account_name,cleaned_reactions,reaction_types,reaction_times


In [17]:
final_combined_df_first_half[final_combined_df_first_half['Success_after_5_years']==-1][['Success_after_5_years', "cleaned_posts"]].loc

<pandas.core.indexing._LocIndexer at 0x23a028aba20>

In [18]:
final_combined_df_first_half[['Success_after_5_years', "cleaned_posts"]].groupby('Success_after_5_years').count()

Unnamed: 0_level_0,cleaned_posts
Success_after_5_years,Unnamed: 1_level_1
-1,1450
0,13484
1,973


In [19]:
#äinportr pandas# To make the code stop running here 

In [20]:
final_combined_df_first_half

Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,Founded Date,...,cleaned_comments,num_comments,comment_days_ago,engagements_comments,comments_on_comments,reposts_number_comments,account_name,cleaned_reactions,reaction_types,reaction_times
0,ORA Graphene Audio Inc.,ari-pinkas-88913811,Ari Pinkas is Co-founder and VP Business Devel...,1543,[],[],[],"[{'title': 'Co-Founder', 'company_name': 'ORA ...","[{'degree': 'BComm, Marketing', 'institution':...",2016,...,[],0,[],[],[],[],,[],[],[]
1,ORA Graphene Audio Inc.,kaiwen-hu-ph-d-a32a0946,- PhD in Materials Engineering. Extensive Expe...,725,"['Feed post number 1\nKaiwen Hu, Ph.D\nKaiwen ...","['Feed post number 1\nKaiwen Hu, Ph.D commente...",[],"[{'title': 'Co-founder VP Research', 'company_...","[{'degree': 'Doctor of Philosophy (Ph.D.), Mat...",2016,...,['Best of luck Rune!'],1,[365],[82],[19],[0],"Kaiwen Hu, Ph.D",[],[],[]
2,ORA Graphene Audio Inc.,michaelkraft,CEO / President / Independent Board Director /...,3729,['Feed post number 1\nMichael Kraft\nMichael K...,"[""Feed post number 1\nMichael Kraft replied to...",['Feed post number 1\nMichael Kraft likes this...,"[{'title': 'Chief Executive Officer', 'company...","[{'degree': 'ExecEd, Technology Marketing & St...",2016,...,['CONGRATS on the CEO role! We may need to tal...,10,"[90, 90, 90, 90, 90, 90, 90, 90, 90, 90]","[28, 71, 28, 68, 243, 85, 66, 108]","[21, 6, 21, 11, 116, 7, 61, 4]","[0, 1, 0, 8, 0, 3, 1, 2]",Michael Kraft,"['Over the past week, I\'ve heard the same thi...","['likes this', 'likes this', 'likes this', 'fi...","[7, 7, 60, 60, 120, 180, 300, 270, 300, 365, 3..."
3,ORA Graphene Audio Inc.,helgeseetzen,My journey took me from tech founder (BrightSi...,4925,['Feed post number 1\nHelge Seetzen\nHelge See...,['Feed post number 1\nHelge Seetzen replied to...,[],"[{'title': 'Managing Partner & CEO', 'company_...","[{'degree': 'PhD, Physics & Computer Science (...",2016,...,"['Congratulations! Bravo to the whole team!!',...",15,"[90, 90, 120, 120, 120, 120, 120, 120, 120, 12...","[465, 70, 465, 465, 465, 465, 465, 465]","[75, 5, 75, 75, 75, 75, 75, 75]","[7, 0, 7, 7, 7, 7, 7, 7]",Helge Seetzen,[],[],[]
4,ORA Graphene Audio Inc.,regaskell,Robert-Eric Gaskell is an experienced audio pr...,888,[],"['Feed post number 1\nRobert-Eric Gaskell, Ph....","[""Feed post number 1\nRobert-Eric Gaskell, Ph....","[{'title': 'Co-Founder, Inventor, VP Product',...","[{'degree': 'PhD, Sound Recording', 'instituti...",2016,...,"['Which is it? ""100% pure graphene"" or multi l...",1,[1825],[48],[12],[1],"Robert-Eric Gaskell, Ph.D.","[""Thank you Robert-Eric Gaskell, Ph.D. and Ari...","['likes this', 'likes this', 'likes this', 'li...","[21, 30, 30, 150, 300, 300, 300, 330, 330, 330..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15902,Particle Code;https://www.crunchbase.com/organ...,galia-benartzi-1a0aa220,"I am a technology entrepreneur, currently work...",5108,[],"[""Feed post number 1\nGalia Benartzi commented...",['Feed post number 1\nGalia Benartzi likes thi...,"[{'title': 'Co-Founder, Business Development',...","[{'degree': '', 'institution': 'Johns Hopkins ...",2009-01-01,...,['Been working with Itay for years and happy t...,2,"[1460, 1825]","[50, 7]","[11, 2]","[16, 0]",Galia Benartzi,"[""Super excited to finally announce that I've ...","['likes this', 'likes this', 'likes this', 'li...","[30, 730, 1095, 1460, 1825, 3285, 3650, 4015, ..."
15903,Particle Code;https://www.crunchbase.com/organ...,yotamshacham,Game developer who become a technology entrepr...,1564,['Feed post number 1\nYotam Shacham\nYotam Sha...,['Feed post number 1\nYotam Shacham replied to...,"[""Feed post number 1\nYotam Shacham likes this...","[{'title': 'Chief Technology Officer', 'compan...","[{'degree': ""Bachelor's degree, Computer Scien...",2009-01-01,...,['Yuyi Kitano Lum is an email deliverability e...,11,"[240, 240, 240, 240, 240, 240, 365, 365, 365, ...","[38, 38, 100, 93, 87, 98, 173, 148, 49]","[7, 7, 10, 12, 7, 20, 34, 25, 5]","[0, 0, 0, 0, 0, 0, 3, 0, 0]",Yotam Shacham,"[""🚀 Revolutionizing Medical Malpractice Insura...","['likes this', 'likes this', 'likes this', 'li...","[30, 730, 90, 120, 150, 180, 210, 240, 270, 30..."
15904,Particle Code;https://www.crunchbase.com/organ...,yudi-levi-4bb91911,,4533,['Feed post number 1\nYudi Levi\nYudi Levi\n •...,[],['Feed post number 1\nYudi Levi likes this\nJo...,"[{'title': 'Chief Architect', 'company_name': ...",[],2009-01-01,...,[],0,[],[],[],[],,['Calcalist כלכליסט picked up my LinkedIn post...,[],[365]
15905,myDocket;https://www.crunchbase.com/organizati...,jasonwesbecher,Twenty three-year career as metrics-driven sal...,3743,"[""Feed post number 1\nJason Wesbecher\nJason W...",['Feed post number 1\nJason Wesbecher commente...,"[""Feed post number 1\nJason Wesbecher likes th...","[{'title': 'Chief Executive Officer', 'company...","[{'degree': 'Economics, The Wharton School of ...",2012-06-01,...,['I miss you Gio!'],1,[365],"[169, 150, 261, 11, 168, 22, 174, 222]","[41, 63, 32, 13, 74, 5, 55, 61]","[2, 0, 1, 1, 1, 0, 0, 0]",Jason Wesbecher,"[""My sales team invited me to three sales call...","['likes this', 'likes this', 'likes this', 'li...","[7, 730, 21, 120, 150, 180, 210, 240, 270, 365..."


In [21]:
final_combined_df = final_combined_df[['Organization Name', 'username', 'about', 'followers',
       'posts', 'comments', 'reactions', 'experience', 'education',
       'Founded Date', 'equity_raised_in_5_years', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Equity_raised_until_now',
       'Success_until_now', 'cleaned_posts', 'num_posts', 'num_reposted_posts',
       'post_times', 'engagements_posts', 'comments_on_posts',
       'reposts_number_posts', 'cleaned_comments', 'num_comments',
       'comment_days_ago', 'engagements_comments', 'comments_on_comments',
       'reposts_number_comments', 'account_name', 'cleaned_reactions',
       'reaction_types', 'new_post_times']]

In [22]:
final_combined_df_first_half[['Success_after_5_years', "cleaned_posts"]].groupby('Success_after_5_years').count()


Unnamed: 0_level_0,cleaned_posts
Success_after_5_years,Unnamed: 1_level_1
-1,1450
0,13484
1,973


In [205]:
final_combined_df.columns

Index(['Organization Name', 'username', 'about', 'followers', 'posts',
       'comments', 'reactions', 'experience', 'education', 'Founded Date',
       'equity_raised_in_5_years', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Equity_raised_until_now',
       'Success_until_now', 'cleaned_posts', 'num_posts', 'num_reposted_posts',
       'engagements_posts', 'comments_on_posts', 'reposts_number_posts',
       'post_times', 'cleaned_comments', 'num_comments', 'comment_days_ago',
       'account_name', 'engagements_comments', 'cleaned_reactions',
       'reaction_types', 'reaction_times'],
      dtype='object')

In [24]:
# Rename a column in the DataFrame
final_combined_df = final_combined_df.rename(columns={"new_post_times": "reaction_times"})


In [25]:
final_combined_df = pd.concat([final_combined_df_first_half, final_combined_df], axis=0, ignore_index=True)


In [26]:
final_combined_df[['Success_after_5_years', "cleaned_posts"]].groupby('Success_after_5_years').count()

Unnamed: 0_level_0,cleaned_posts
Success_after_5_years,Unnamed: 1_level_1
-1,1450
0,19404
1,1026


In [27]:
# Function to convert input values to lists
def to_list(x):
    """
    Converts input values to a list format.

    Parameters:
    x: The input value (can be string, list, or other types).

    Returns:
    list: A list representation of the input value.
    """
    if isinstance(x, str):  # If input is a string
        try:
            return ast.literal_eval(x)  # Safely evaluate strings that look like Python lists
        except (ValueError, SyntaxError):
            return []  # Return an empty list if evaluation fails
    elif isinstance(x, list):  # If input is already a list
        return x
    else:  # For all other types (e.g., NaN)
        return []

# Apply the conversion to the 'cleaned_reactions' column
final_combined_df['cleaned_reactions'] = final_combined_df['cleaned_reactions'].apply(to_list)
# This ensures all entries in the 'cleaned_reactions' column are converted to lists.

# Display the updated DataFrame to verify the changes
print(final_combined_df)


In [28]:
# Step 1: Filter rows where 'Success_after_5_years' equals -1
filtered_data = final_combined_df[
    final_combined_df['Success_after_5_years'] == -1
].copy()
# - This filters the DataFrame to include only rows where 'Success_after_5_years' is -1.
# - `.copy()` ensures that modifications to `filtered_data` won't affect the original DataFrame.

# Step 2: Ensure 'cleaned_posts' is a list and is not empty
filtered_data = filtered_data[
    filtered_data['cleaned_posts'].apply(lambda x: isinstance(x, list) and len(x) > 0)
]
# - The `apply` function checks each element in the 'cleaned_posts' column:
#   1. `isinstance(x, list)`: Ensures the value is a list.
#   2. `len(x) > 0`: Ensures the list is not empty.
# - Rows that fail these checks are excluded from the filtered DataFrame.

# Step 3: Display the filtered DataFrame
print(filtered_data)
# - This prints the resulting filtered DataFrame to verify the filtering process.



Empty DataFrame
Columns: [Organization Name, username, about, followers, posts, comments, reactions, experience, education, Founded Date, equity_raised_in_5_years, Success_after_5_years, equity_rounds_raised_after_5_years, Equity_raised_until_now, Success_until_now, cleaned_posts, num_posts, num_reposted_posts, post_times, engagements_posts, comments_on_posts, reposts_number_posts, cleaned_comments, num_comments, comment_days_ago, engagements_comments, comments_on_comments, reposts_number_comments, account_name, cleaned_reactions, reaction_types, reaction_times]
Index: []

[0 rows x 32 columns]


In [29]:
final_combined_df = final_combined_df[['Organization Name', 'username', 'about',
       'followers', 'posts', 'comments', 'reactions', 'experience',
       'education', 'Founded Date', 'equity_raised_in_5_years',
       'Success_after_5_years', 'equity_rounds_raised_after_5_years',
       'Equity_raised_until_now', 'Success_until_now',
       'cleaned_posts', 'num_posts', 'num_reposted_posts', 'engagements_posts',
       'comments_on_posts', 'reposts_number_posts', 'post_times',
       'cleaned_comments', 'num_comments', 'comment_days_ago', 'account_name',
       'engagements_comments', 'cleaned_reactions', 'reaction_types',
       'reaction_times']]

In [30]:
final_combined_df.loc[1, "education"]

"[{'degree': 'Doctor of Philosophy (Ph.D.), Materials Engineering', 'institution': 'McGill University', 'uni_url': 'https://www.linkedin.com/company/4855/', 'start_date': '2012', 'end_date': '2017'}, {'degree': 'Bachelor of Applied Science (B.A.Sc.), Materials science and engineering', 'institution': 'University of Toronto', 'uni_url': 'https://www.linkedin.com/company/3660/', 'start_date': '2008', 'end_date': '2012'}]"

## 2. Extraction of Experience an Education Features & Data Preparation for NLP Text analysis

In [None]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split

In [31]:
# Function to split data into training and test sets
def train_test_split_data(data, test_size=0.3, random_state=42):
    """
    Splits a dataset into training and test sets.

    Parameters:
    data (pd.DataFrame or array-like): The dataset to split.
    test_size (float): The proportion of the dataset to include in the test split (default is 0.3).
    random_state (int): The random seed to ensure reproducibility (default is 42).

    Returns:
    tuple: A tuple containing the training data and test data.
    """
    # Perform the train-test split
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)
    
    # Verify the result by printing the number of rows in each split
    print(f"Number of rows in training data: {len(train_data)}")
    print(f"Number of rows in test data: {len(test_data)}")

    # Return the training and test datasets
    return train_data, test_data

In [32]:
train_combined_data, test_combined_data = train_test_split_data(final_combined_df, test_size=0.3, random_state=42)

Number of rows in training data: 15316
Number of rows in test data: 6564


In [33]:
train_combined_data[train_combined_data['Success_after_5_years']==-1][["cleaned_posts", 'Success_after_5_years']]

Unnamed: 0,cleaned_posts,Success_after_5_years
14798,['Ready for pumpkin spice season! 🍂🎃 I just li...,-1
14399,"[""Fishbowl\nFishbowl\n17,912 followers\n17,912...",-1
14053,['Great to be in San Diego! Energised for Sale...,-1
13538,[],-1
13789,"['After 4 years of work, ""Dungeons of Hinterbe...",-1
...,...,...
13773,"[""The last watch party was so much fun that we...",-1
15265,['Filament Limited\nFilament Limited\n889 foll...,-1
15422,[],-1
15787,"[""Hi all! Hato Hub is looking for an APM (and ...",-1


### Additional split for train for the supervised feature creation
For Supervised feature creatoin, we have to preven information leakage, make sure that they were not trained on the same data as they will be used for the final prediction.

In [34]:
train_feature_combined_data, train_final_prediction_combined_data = train_test_split_data(train_combined_data, test_size=0.5, random_state=42)

Number of rows in training data: 7658
Number of rows in test data: 7658


In [204]:
train_final_prediction_combined_data.to_csv(r"C:\Users\Benja\Downloads\train_data_numerical_features.csv")
test_combined_data.to_csv(r"C:\Users\Benja\Downloads\test_data_numerical_features.csv")

In [173]:
experience_education_df = train_feature_combined_data[['username', 'Success_after_5_years', 'experience', 'education']]
train_model_experience_education_df = train_final_prediction_combined_data[['username', 'Success_after_5_years', 'experience', 'education']]
test_experience_education_df = test_combined_data[['username', 'Success_after_5_years', 'experience', 'education']]
experience_education_df.loc[5, 'education']

"[{'degree': 'PhD, Electrical and Computer Engineering', 'institution': 'McGill University', 'uni_url': 'https://www.linkedin.com/company/4855/', 'start_date': '2009', 'end_date': '2014'}, {'degree': 'M.Eng, Electrical and Computer Engineering', 'institution': 'McGill University', 'uni_url': 'https://www.linkedin.com/company/4855/', 'start_date': '2007', 'end_date': '2009'}, {'degree': 'Bachelor of Science (BS), Physics', 'institution': 'University of Oregon', 'uni_url': 'https://www.linkedin.com/company/5827/', 'start_date': '2000', 'end_date': '2004'}]"

### 2.1 Experience and Education Features

#### 2.1.1 University Prestige

In [174]:
# Initialize dictionaries to store statistics for companies and universities
company_stats = {}
university_stats = {}

# Iterate over each row in the DataFrame to process experience and education
for idx, row in experience_education_df.iterrows():
    success_score = row['Success_after_5_years']  # Success or failure indicator
    
    # Process experience (extract company information)
    experiences = ast.literal_eval(row['experience'])  # Safely convert string to a list of dictionaries
    for experience in experiences:
        company = experience['company_name']  # Extract company name
        if company not in company_stats:
            # Initialize stats for a new company
            company_stats[company] = {'success': 0, 'failure': 0, 'count': 0}
        company_stats[company]['count'] += 1  # Increment count
        if success_score == 1:
            company_stats[company]['success'] += 1  # Increment success count
        elif success_score == -1:
            company_stats[company]['failure'] += 1  # Increment failure count
    
    # Process education (extract university information)
    education_entries = ast.literal_eval(row['education'])  # Safely convert string to a list of dictionaries
    for education in education_entries:
        university = education['institution']  # Extract university name
        if university not in university_stats:
            # Initialize stats for a new university
            university_stats[university] = {'success': 0, 'failure': 0, 'count': 0}
        university_stats[university]['count'] += 1  # Increment count
        if success_score == 1:
            university_stats[university]['success'] += 1  # Increment success count
        elif success_score == -1:
            university_stats[university]['failure'] += 1  # Increment failure count

# Prepare data for company statistics
company_data = []
for company, stats in company_stats.items():
    # Calculate success and failure percentages
    success_percentage = (stats['success'] / stats['count']) * 100 if stats['count'] > 0 else 0
    failure_percentage = (stats['failure'] / stats['count']) * 100 if stats['count'] > 0 else 0
    company_data.append([company, success_percentage, failure_percentage, stats['count']])

# Prepare data for university statistics
university_data = []
for university, stats in university_stats.items():
    # Calculate success and failure percentages
    success_percentage = (stats['success'] / stats['count']) * 100 if stats['count'] > 0 else 0
    failure_percentage = (stats['failure'] / stats['count']) * 100 if stats['count'] > 0 else 0
    university_data.append([university, success_percentage, failure_percentage, stats['count']])

# Convert to DataFrames for companies and universities
company_df = pd.DataFrame(company_data, columns=['Company', 'Success_Score', 'Failure_Score', 'Count'])
university_df = pd.DataFrame(university_data, columns=['University', 'Success_Score', 'Failure_Score', 'Count'])

# Display the DataFrames to verify results
print("Company Statistics DataFrame:")
print(company_df.head())
print("\nUniversity Statistics DataFrame:")
print(university_df.head())

# Optional: Save the DataFrames to CSV files
company_df.to_csv("company_statistics.csv", index=False)
university_df.to_csv("university_statistics.csv", index=False)


In [175]:
company_5_df = company_df[company_df["Count"]>5]
company_10_df = company_df[company_df["Count"]>10]
company_20_df = company_df[company_df["Count"]>20]
company_10_df[:50]

Unnamed: 0,Company,Success_Score,Failure_Score,Count
62,Self-employed,4.316547,11.510791,139
103,IBM,3.100775,1.550388,129
146,Intuit,13.333333,26.666667,15
201,SpaceX,7.142857,0.0,14
231,Apple ·,6.25,25.0,16
289,Forbes Technology Council,8.695652,13.043478,23
293,Juniper Networks,21.428571,0.0,14
332,Career Break,5.769231,11.538462,52
336,Google,18.518519,10.582011,189
370,University of Michigan,0.0,12.5,16


In [176]:
university_5_df = university_df[university_df["Count"]>5]
university_10_df = university_df[university_df["Count"]>10]
university_20_df = university_df[university_df["Count"]>20]
university_10_df.sort_values(by='Success_Score', ascending=False)[:50].reset_index(drop=True)

Unnamed: 0,University,Success_Score,Failure_Score,Count
0,University of Toronto - Rotman School of Manag...,25.0,6.25,16
1,Caltech,22.222222,14.814815,27
2,Brandeis University,18.181818,0.0,11
3,Wesleyan University,18.181818,0.0,11
4,Delft University of Technology,18.181818,0.0,11
5,William & Mary,18.181818,18.181818,11
6,Concordia University,16.666667,0.0,12
7,University of Warwick,16.666667,8.333333,12
8,Y Combinator,16.438356,32.876712,73
9,Stanford University Graduate School of Business,15.555556,2.222222,90


In [177]:
university_10_df.sort_values(by='Failure_Score', ascending=False)[:50].reset_index(drop=True)

Unnamed: 0,University,Success_Score,Failure_Score,Count
0,University College Dublin,0.0,42.857143,14
1,Reforge,0.0,36.363636,11
2,Y Combinator,16.438356,32.876712,73
3,University of Nebraska-Lincoln,0.0,30.769231,13
4,Università Bocconi,0.0,30.769231,13
5,"Indian Institute of Technology, Delhi",0.0,29.411765,17
6,USC Marshall School of Business,5.882353,23.529412,17
7,University of Central Florida,7.692308,23.076923,13
8,Oregon State University,7.692308,23.076923,13
9,MIT Sloan School of Management,6.666667,22.222222,45


In [178]:
# Function to calculate scores and counts for companies and universities
def calculate_scores_and_counts(row, company_df, university_df):
    """
    Calculates aggregated success and failure scores for companies and universities 
    based on the provided row of data and corresponding DataFrames.

    Parameters:
    row (pd.Series): A single row of data containing 'experience' and 'education' columns.
    company_df (pd.DataFrame): DataFrame containing company success and failure stats.
    university_df (pd.DataFrame): DataFrame containing university success and failure stats.

    Returns:
    dict: A dictionary containing aggregated scores and counts for companies and universities.
    """
    # Initialize dictionaries to store scores and counts
    company_columns = {
        f'company_{suffix}_{metric}': 0
        for suffix in ['5', '10', '20'] for metric in ['success', 'failure', 'amount']
    }
    university_columns = {
        f'university_{suffix}_{metric}': 0
        for suffix in ['5', '10', '20'] for metric in ['success', 'failure', 'amount']
    }

    # Process experience (companies)
    experiences = ast.literal_eval(row['experience'])  # Safely convert string to list of dictionaries
    for experience in experiences:
        company = experience['company_name']
        success_score = row['Success_after_5_years']

        # Check if the company exists in each filtered company DataFrame
        for df, suffix in zip([company_5_df, company_10_df, company_20_df], ['5', '10', '20']):
            if company in df['Company'].values:
                company_columns[f'company_{suffix}_amount'] += 1
                company_columns[f'company_{suffix}_success'] += df.loc[df['Company'] == company, 'Success_Score'].values[0]
                company_columns[f'company_{suffix}_failure'] += df.loc[df['Company'] == company, 'Failure_Score'].values[0]

    # Process education (universities)
    education_entries = ast.literal_eval(row['education'])  # Safely convert string to list of dictionaries
    for education in education_entries:
        university = education['institution']

        # Check if the university exists in each filtered university DataFrame
        for df, suffix in zip([university_5_df, university_10_df, university_20_df], ['5', '10', '20']):
            if university in df['University'].values:
                university_columns[f'university_{suffix}_amount'] += 1
                university_columns[f'university_{suffix}_success'] += df.loc[df['University'] == university, 'Success_Score'].values[0]
                university_columns[f'university_{suffix}_failure'] += df.loc[df['University'] == university, 'Failure_Score'].values[0]

    # Calculate averages for success and failure scores
    for suffix in ['5', '10', '20']:
        if company_columns[f'company_{suffix}_amount'] > 0:
            company_columns[f'company_{suffix}_success'] /= company_columns[f'company_{suffix}_amount']
            company_columns[f'company_{suffix}_failure'] /= company_columns[f'company_{suffix}_amount']
        if university_columns[f'university_{suffix}_amount'] > 0:
            university_columns[f'university_{suffix}_success'] /= university_columns[f'university_{suffix}_amount']
            university_columns[f'university_{suffix}_failure'] /= university_columns[f'university_{suffix}_amount']

    # Combine results from companies and universities
    return {**company_columns, **university_columns}

# Function to add scores to a given DataFrame
def add_scores_to_dataframe(dataframe, company_df, university_df):
    """
    Adds aggregated scores and counts for companies and universities to a DataFrame.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame containing 'experience' and 'education' columns.
    company_df (pd.DataFrame): DataFrame containing company success and failure stats.
    university_df (pd.DataFrame): DataFrame containing university success and failure stats.

    Returns:
    pd.DataFrame: The updated DataFrame with additional columns for scores and counts.
    """
    new_columns = []
    for _, row in dataframe.iterrows():
        result = calculate_scores_and_counts(row, company_df, university_df)
        new_columns.append(result)
    # Create a DataFrame from the new columns and concatenate with the original DataFrame
    new_columns_df = pd.DataFrame(new_columns)
    return pd.concat([dataframe.reset_index(drop=True), new_columns_df.reset_index(drop=True)], axis=1)

# Apply the processing to all relevant DataFrames
experience_education_df = add_scores_to_dataframe(experience_education_df, company_df, university_df)
train_model_experience_education_df = add_scores_to_dataframe(train_model_experience_education_df, company_df, university_df)
test_experience_education_df = add_scores_to_dataframe(test_experience_education_df, company_df, university_df)

print("Scores added to all dataframes successfully.")


Scores added to all dataframes successfully.


In [None]:
# Function to calculate university prestige score
def calculate_university_prestige(row, universities, scores):
    # Convert the experience string into a list of dictionaries
    experiences = ast.literal_eval(row['experience'])
    matched_scores = []

    # Check each company in experience for substring matches (case-insensitive) in University_prestige
    for exp in experiences:
        company_name = exp['company_name'].replace('·', '').strip().lower()
        for i, university in enumerate(universities):
            if university.lower() in company_name:
                matched_scores.append(scores[i])
    
    # Calculate average prestige score and match count
    matched_scores = [score for score in matched_scores if pd.notna(score)]
    if matched_scores:
        avg_score = sum(matched_scores) / len(matched_scores)
        count = len(matched_scores)
    else:
        avg_score = None
        count = 0

    return pd.Series([avg_score, count])

# Extract the list of universities and their scores
universities = University_prestige['Institution'].tolist()
scores = University_prestige['Final_Combined_Score'].tolist()

# Apply the function to the dataframe
experience_education_df[['university_prestige', 'match_count']] = experience_education_df.apply(
    calculate_university_prestige, universities=universities, scores=scores, axis=1)

train_model_experience_education_df[['university_prestige', 'match_count']] = train_model_experience_education_df.apply(
    calculate_university_prestige, universities=universities, scores=scores, axis=1)

test_experience_education_df[['university_prestige', 'match_count']] = test_experience_education_df.apply(
    calculate_university_prestige, universities=universities, scores=scores, axis=1)

#test_experience_education_df[test_experience_education_df["match_count"]>0]


#### 2.1.2 VC or Consulting experience

In [179]:
# Define keywords for identifying VC and Consulting companies
vc_keywords = ['VC', 'venture capital', 'capital']
consulting_keywords = ['consulting', 'consultancy', 'advisory', 'strategy', 'consultants']

# Function to check if the company in the experience is VC or Consulting
def check_vc_or_consulting(experience):
    """
    Determines if a company listed in the experience is a VC (Venture Capital) or Consulting firm.

    Parameters:
    experience (str): String representation of a list of dictionaries, where each dictionary contains company details.

    Returns:
    tuple: A tuple with two boolean values:
        - is_vc: True if any company matches VC keywords.
        - is_consulting: True if any company matches Consulting keywords.
    """
    is_vc = False
    is_consulting = False

    # Check if experience is valid (not NaN or empty)
    if pd.notna(experience) and experience != '':
        try:
            # Convert the string to a list of dictionaries
            experiences = ast.literal_eval(experience)
            for exp in experiences:
                company_name = exp['company_name'].lower()  # Convert to lowercase for case-insensitivity

                # Check for VC keywords
                if any(keyword.lower() in company_name for keyword in vc_keywords):
                    is_vc = True

                # Check for Consulting keywords
                if any(keyword.lower() in company_name for keyword in consulting_keywords):
                    is_consulting = True
        except (ValueError, SyntaxError):  # Handle potential parsing errors
            pass

    return is_vc, is_consulting

# Function to process a DataFrame and add VC/Consulting information
def add_vc_consulting_columns(dataframe):
    """
    Adds columns indicating whether an individual has worked in VC or Consulting to a DataFrame.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame containing an 'experience' column.

    Returns:
    pd.DataFrame: The original DataFrame with two new columns:
        - 'worked_in_vc': Boolean indicating if the individual worked in VC.
        - 'worked_in_consulting': Boolean indicating if the individual worked in Consulting.
    """
    vc_consulting_info = []
    for idx, row in dataframe.iterrows():
        is_vc, is_consulting = check_vc_or_consulting(row['experience'])  # Check each row
        vc_consulting_info.append({'worked_in_vc': is_vc, 'worked_in_consulting': is_consulting})
    
    # Convert the results into a DataFrame and concatenate with the original DataFrame
    vc_consulting_df = pd.DataFrame(vc_consulting_info)
    return pd.concat([dataframe.reset_index(drop=True), vc_consulting_df.reset_index(drop=True)], axis=1)

# Apply the function to all relevant DataFrames
experience_education_df = add_vc_consulting_columns(experience_education_df).dropna(subset=['username'])
train_model_experience_education_df = add_vc_consulting_columns(train_model_experience_education_df).dropna(subset=['username'])
test_experience_education_df = add_vc_consulting_columns(test_experience_education_df).dropna(subset=['username'])

print("VC and Consulting columns added successfully to all dataframes.")


In [180]:
experience_education_df.head(1)

Unnamed: 0,username,Success_after_5_years,experience,education,company_5_success,company_5_failure,company_5_amount,company_10_success,company_10_failure,company_10_amount,...,university_5_failure,university_5_amount,university_10_success,university_10_failure,university_10_amount,university_20_success,university_20_failure,university_20_amount,worked_in_vc,worked_in_consulting
5499,jerry-nihen-18640a2,0.0,"[{'title': 'Cloud Native Architect', 'company_...",[{'degree': 'Coursework in Electronics & Mathe...,32.0,4.0,2.0,32.0,4.0,2.0,...,11.263158,2.0,3.333333,11.263158,2.0,6.666667,12.0,1.0,False,False


In [181]:
train_model_experience_education_df.head(1)

Unnamed: 0,username,Success_after_5_years,experience,education,company_5_success,company_5_failure,company_5_amount,company_10_success,company_10_failure,company_10_amount,...,university_5_failure,university_5_amount,university_10_success,university_10_failure,university_10_amount,university_20_success,university_20_failure,university_20_amount,worked_in_vc,worked_in_consulting
1761,johnmcnicolkc,0.0,"[{'title': 'Founder and Director', 'company_na...","[{'degree': 'Post Graduate Diploma, Accounting...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False


In [182]:
test_experience_education_df.head(1)

Unnamed: 0,username,Success_after_5_years,experience,education,company_5_success,company_5_failure,company_5_amount,company_10_success,company_10_failure,company_10_amount,...,university_5_failure,university_5_amount,university_10_success,university_10_failure,university_10_amount,university_20_success,university_20_failure,university_20_amount,worked_in_vc,worked_in_consulting
17116,joseph-chen-cmrp-691bb510,0.0,"[{'title': 'Founder & CEO', 'company_name': 'L...","[{'degree': '', 'institution': 'Certified Mark...",,,,,,,...,,,,,,,,,,


In [183]:
experience_education_df = experience_education_df.loc[:, ~experience_education_df.columns.duplicated()]


In [184]:
# Drop rows where 'username' is NaN
experience_education_df = experience_education_df.dropna(subset=['username'])

# Display the updated DataFrame
experience_education_df


Unnamed: 0,username,Success_after_5_years,experience,education,company_5_success,company_5_failure,company_5_amount,company_10_success,company_10_failure,company_10_amount,...,university_5_failure,university_5_amount,university_10_success,university_10_failure,university_10_amount,university_20_success,university_20_failure,university_20_amount,worked_in_vc,worked_in_consulting
5499,jerry-nihen-18640a2,0.0,"[{'title': 'Cloud Native Architect', 'company_...",[{'degree': 'Coursework in Electronics & Mathe...,32.0,4.0,2.0,32.0,4.0,2.0,...,11.263158,2.0,3.333333,11.263158,2.0,6.666667,12.0,1.0,False,False
1816,kousuke,0.0,"[{'title': 'Chief Product Officer', 'company_n...",[],0.0,0.0,1.0,0.0,0.0,1.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,False,False
16801,eamendez01,0.0,"[{'title': 'Director', 'company_name': 'Genesi...","[{'degree': 'MBA, Finance, Accounting, and Eco...",,,,,,,...,,,,,,,,,,
21312,giovannibarillari,0.0,"[{'title': 'Site Reliability Engineer', 'compa...","[{'degree': 'Bachelor of Science (BS), Physics...",,,,,,,...,,,,,,,,,,
20024,arbel-freiman,0.0,"[{'title': 'Senior Backend Developer', 'compan...","[{'degree': 'Bachelor of Science (B.Sc.), Comp...",,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13185,philippemnoel,-1.0,"[{'title': 'Co-Founder, CEO', 'company_name': ...","[{'degree': 'Bachelor of Arts - BA, Cum Laude ...",,,,,,,...,,,,,,,,,,
9013,dixondoll,0.0,"[{'title': 'Investor/Board Member', 'company_n...","[{'degree': 'Master of Science (MS), Electrica...",,,,,,,...,,,,,,,,,False,False
11678,richard-medal-p-eng-0999375a,0.0,"[{'title': 'Director of Operations', 'company_...","[{'degree': 'BASc, Electrical Engineering', 'i...",,,,,,,...,,,,,,,,,False,False
18740,nidhi-sachdeva-toronto,0.0,"[{'title': 'Sessional Lecturer', 'company_name...","[{'degree': 'Doctor of Philosophy - PhD, Educa...",,,,,,,...,,,,,,,,,,


In [185]:
experience_education_df[experience_education_df["worked_in_vc"]==True]

Unnamed: 0,username,Success_after_5_years,experience,education,company_5_success,company_5_failure,company_5_amount,company_10_success,company_10_failure,company_10_amount,...,university_5_failure,university_5_amount,university_10_success,university_10_failure,university_10_amount,university_20_success,university_20_failure,university_20_amount,worked_in_vc,worked_in_consulting
1352,chienhungchen,0.0,"[{'title': 'Head Of Technology', 'company_name...","[{'degree': 'Master of Science (MS), Computer ...",0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,9.090909,2.0,12.045455,9.090909,2.0,9.090909,18.181818,1.0,True,False
7251,magdalevinphd,0.0,"[{'title': 'Bilingual Educator Consultant', 'c...",[{'degree': 'Doctor of Philosophy (Ph.D.) Cand...,0.714286,3.571429,2.0,1.428571,7.142857,1.0,...,2.500000,2.0,3.333333,2.500000,2.0,6.666667,5.000000,1.0,True,False
1717,tiffany-davis-b9968151,0.0,"[{'title': 'Executive Business Partner', 'comp...","[{'degree': '', 'institution': 'Foothill Colle...",15.000000,5.000000,1.0,15.000000,5.000000,1.0,...,11.688312,2.0,3.030303,9.090909,1.0,3.030303,9.090909,1.0,True,False
4928,vardan-markosyan-007,0.0,"[{'title': 'Founder & CEO', 'company_name': 'H...","[{'degree': 'Bachelor’s Degree, BS degree in m...",4.316547,11.510791,1.0,4.316547,11.510791,1.0,...,0.000000,2.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,True,False
7053,kuttas,0.0,"[{'title': 'Technical Fellow', 'company_name':...","[{'degree': 'BS , Computer Science', 'institut...",0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,1.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2311,amybetz,0.0,"[{'title': 'Marketing Manager', 'company_name'...","[{'degree': ""Bachelor's Degree, Biology / Orga...",0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,1.0,8.333333,0.000000,1.0,0.000000,0.000000,0.0,True,False
2296,kyledumont,0.0,"[{'title': 'CTO & Co-founder', 'company_name':...",[{'degree': 'Master of Business Administration...,14.747475,14.646465,4.0,14.747475,14.646465,4.0,...,1.388889,3.0,4.166667,4.166667,1.0,4.166667,4.166667,1.0,True,False
7329,ntereshchenko,0.0,[{'title': 'International Student Coordinator'...,"[{'degree': 'Master of Arts - MA, Visual and M...",4.761905,15.873016,1.0,4.761905,15.873016,1.0,...,17.605634,1.0,7.042254,17.605634,1.0,7.042254,17.605634,1.0,True,False
2982,piyushmahapatra,0.0,"[{'title': 'Chief Innovation Officer', 'compan...","[{'degree': 'BA (hons), Medicine', 'institutio...",0.000000,50.000000,1.0,0.000000,0.000000,0.0,...,18.181818,2.0,18.181818,18.181818,2.0,0.000000,0.000000,0.0,True,False


In [186]:
experience_education_df.columns

Index(['username', 'Success_after_5_years', 'experience', 'education',
       'company_5_success', 'company_5_failure', 'company_5_amount',
       'company_10_success', 'company_10_failure', 'company_10_amount',
       'company_20_success', 'company_20_failure', 'company_20_amount',
       'university_5_success', 'university_5_failure', 'university_5_amount',
       'university_10_success', 'university_10_failure',
       'university_10_amount', 'university_20_success',
       'university_20_failure', 'university_20_amount', 'worked_in_vc',
       'worked_in_consulting'],
      dtype='object')

In [187]:
experience_education_df.loc[5499, 'experience']

"[{'title': 'Cloud Native Architect', 'company_name': 'Portworx by Pure Storage ·', 'company_linked_url': 'https://www.linkedin.com/company/6578150/', 'location': 'Portland, Oregon Metropolitan Area', 'start_date': 'Jul 2022', 'end_date': 'Present'}, {'title': 'Systems Engineer', 'company_name': 'Veeam Software ·', 'company_linked_url': 'https://www.linkedin.com/company/236413/', 'location': 'Portland, Oregon Metropolitan Area', 'start_date': 'Feb 2021', 'end_date': 'Jul 2022'}, {'title': 'Senior Systems Engineer', 'company_name': 'CMG Financial ·', 'company_linked_url': 'https://www.linkedin.com/company/99772/', 'location': 'Roseville, California, United States', 'start_date': 'Dec 2019', 'end_date': 'Feb 2021'}, {'title': 'Manager of Managed Services', 'company_name': 'DSA Technologies, Inc.', 'company_linked_url': 'https://www.linkedin.com/company/2307236/', 'location': 'Sacramento, California Area', 'start_date': '2019', 'end_date': '2019'}, {'title': 'Senior Systems Consultant', '

In [188]:
University_prestige = pd.read_csv(r"C:\Users\Benja\Downloads\final_combined_rankings.csv")
University_prestige

Unnamed: 0,Institution,Institution.1,Final_Combined_Score
0,fudan university,Fudan University,10.000000
1,harvard university,Harvard University,9.724189
2,university of cambridge,University of Cambridge,9.611500
3,university of oxford,University of Oxford,9.592738
4,stanford university,Stanford University,9.566553
...,...,...,...
4578,bentley (mccallum),Bentley (McCallum),0.408498
4579,solbridge international school of business,SolBridge International School of Business,0.408498
4580,university of massachusetts (dartmouth),University of Massachusetts (Dartmouth),0.408498
4581,university of sarajevo,,0.013773


In [189]:
University_prestige = University_prestige.drop_duplicates(subset='Institution', keep='first')
University_prestige

Unnamed: 0,Institution,Institution.1,Final_Combined_Score
0,fudan university,Fudan University,10.000000
1,harvard university,Harvard University,9.724189
2,university of cambridge,University of Cambridge,9.611500
3,university of oxford,University of Oxford,9.592738
4,stanford university,Stanford University,9.566553
...,...,...,...
4577,claremont (drucker),Claremont (Drucker),0.408498
4578,bentley (mccallum),Bentley (McCallum),0.408498
4579,solbridge international school of business,SolBridge International School of Business,0.408498
4580,university of massachusetts (dartmouth),University of Massachusetts (Dartmouth),0.408498


In [193]:
Company_prestige = pd.read_csv(r"C:\Users\Benja\Downloads\company_prestige_normalized.csv")


In [194]:
Company_prestige

Unnamed: 0,name,Normalized_Score_fortune,Normalized_Score_forbes,Normalized_Score_5_10
0,walmart,10.00000,9.909910,10.000000
1,apple,9.93988,9.959960,9.997702
2,amazon,9.97996,9.894895,9.992000
3,berkshire hathaway,9.87976,9.984985,9.989693
4,unitedhealth group,9.91984,9.884885,9.975996
...,...,...,...,...
2171,merlin properties socimi s.a,,0.020020,5.004569
2172,heico,,0.005005,5.001142
2173,w.p. carey,,0.000000,5.000000
2174,robert half international,0.00000,,5.000000


#### 2.1.2 Company Prestige

In [195]:
# Function to calculate company prestige scores and match counts
def calculate_company_prestige(row, companies, scores_fortune, scores_forbes, scores_5_10):
    """
    Calculates average prestige scores and match counts for companies in the experience column.

    Parameters:
    row (pd.Series): A single row of the DataFrame containing the 'experience' column.
    companies (list): List of company names to match against.
    scores_fortune (list): List of normalized Fortune scores corresponding to companies.
    scores_forbes (list): List of normalized Forbes scores corresponding to companies.
    scores_5_10 (list): List of normalized 5-10-year success scores corresponding to companies.

    Returns:
    pd.Series: A Series containing average scores and match counts for each prestige type.
    """
    # Convert the experience string into a list of dictionaries
    experiences = ast.literal_eval(row['experience'])
    
    # Initialize lists for matched scores for each score type
    matched_scores_fortune = []
    matched_scores_forbes = []
    matched_scores_5_10 = []

    # Iterate through experiences and match companies
    for exp in experiences:
        company_name = exp['company_name'].replace('·', '').strip().lower()  # Normalize company name
        for i, company in enumerate(companies):
            if company.lower() in company_name:  # Check for substring match
                # Append scores for each category if they exist
                if scores_fortune[i] is not None:
                    matched_scores_fortune.append(scores_fortune[i])
                if scores_forbes[i] is not None:
                    matched_scores_forbes.append(scores_forbes[i])
                if scores_5_10[i] is not None:
                    matched_scores_5_10.append(scores_5_10[i])

    # Remove NaN values from matched scores
    matched_scores_fortune = [score for score in matched_scores_fortune if pd.notna(score)]
    matched_scores_forbes = [score for score in matched_scores_forbes if pd.notna(score)]
    matched_scores_5_10 = [score for score in matched_scores_5_10 if pd.notna(score)]

    # Calculate average scores
    avg_score_fortune = sum(matched_scores_fortune) / len(matched_scores_fortune) if matched_scores_fortune else None
    avg_score_forbes = sum(matched_scores_forbes) / len(matched_scores_forbes) if matched_scores_forbes else None
    avg_score_5_10 = sum(matched_scores_5_10) / len(matched_scores_5_10) if matched_scores_5_10 else None

    # Calculate match counts directly from the lengths of the matched score lists
    match_count_fortune = len(matched_scores_fortune)
    match_count_forbes = len(matched_scores_forbes)
    match_count_5_10 = len(matched_scores_5_10)

    # Return the averages and match counts for each score type
    return pd.Series([
        avg_score_fortune, avg_score_forbes, avg_score_5_10,
        match_count_fortune, match_count_forbes, match_count_5_10
    ])

# Extract the list of companies and their scores
companies = Company_prestige['name'].tolist()
scores_fortune = Company_prestige['Normalized_Score_fortune'].tolist()
scores_forbes = Company_prestige['Normalized_Score_forbes'].tolist()
scores_5_10 = Company_prestige['Normalized_Score_5_10'].tolist()

# Apply the function to the experience_education_df DataFrame
experience_education_df[
    [
        'company_prestige_fortune', 
        'company_prestige_forbes', 
        'company_prestige_5_10', 
        'match_count_fortune', 
        'match_count_forbes', 
        'match_count_5_10'
    ]
] = experience_education_df.apply(
    calculate_company_prestige, 
    companies=companies, 
    scores_fortune=scores_fortune, 
    scores_forbes=scores_forbes, 
    scores_5_10=scores_5_10, 
    axis=1
)

# Apply the function to the train_model_experience_education_df DataFrame
train_model_experience_education_df[
    [
        'company_prestige_fortune', 
        'company_prestige_forbes', 
        'company_prestige_5_10', 
        'match_count_fortune', 
        'match_count_forbes', 
        'match_count_5_10'
    ]
] = train_model_experience_education_df.apply(
    calculate_company_prestige, 
    companies=companies, 
    scores_fortune=scores_fortune, 
    scores_forbes=scores_forbes, 
    scores_5_10=scores_5_10, 
    axis=1
)

# Apply the function to the test_experience_education_df DataFrame
test_experience_education_df[
    [
        'company_prestige_fortune', 
        'company_prestige_forbes', 
        'company_prestige_5_10', 
        'match_count_fortune', 
        'match_count_forbes', 
        'match_count_5_10'
    ]
] = test_experience_education_df.apply(
    calculate_company_prestige, 
    companies=companies, 
    scores_fortune=scores_fortune, 
    scores_forbes=scores_forbes, 
    scores_5_10=scores_5_10, 
    axis=1
)

# Display confirmation message
print("Company prestige scores and match counts added to all DataFrames.")


In [196]:
test_experience_education_df

Unnamed: 0,username,Success_after_5_years,experience,education,company_5_success,company_5_failure,company_5_amount,company_10_success,company_10_failure,company_10_amount,...,worked_in_vc,worked_in_consulting,university_prestige,match_count,company_prestige_fortune,company_prestige_forbes,company_prestige_5_10,match_count_fortune,match_count_forbes,match_count_5_10
17116,joseph-chen-cmrp-691bb510,0.0,"[{'title': 'Founder & CEO', 'company_name': 'L...","[{'degree': '', 'institution': 'Certified Mark...",,,,,,,...,,,,0.0,5.991984,7.711211,7.553100,6.0,10.0,11.0
21025,dylan-lawhon,0.0,"[{'title': 'Bug Bounty Hunter', 'company_name'...","[{'degree': 'Applied Cybersecuirty', 'institut...",,,,,,,...,,,,0.0,8.717435,9.749750,9.670760,2.0,2.0,2.0
3115,victor-i,0.0,"[{'title': 'Founder & Managing Partner', 'comp...","[{'degree': ""Bachelor's degree, Finance"", 'ins...",3.448276,3.448276,1.0,3.448276,3.448276,1.0,...,False,False,,0.0,,3.753754,5.856634,0.0,1.0,1.0
14919,davidstavens,1.0,"[{'title': 'Advisor', 'company_name': 'Future ...","[{'degree': 'Ph.D., Computer Science', 'instit...",,,,,,,...,,,9.566553,1.0,8.056112,8.713714,9.283410,1.0,1.0,1.0
15700,melissajamesmorrison,-1.0,"[{'title': 'Partner', 'company_name': 'Tourism...",[{'degree': 'Graduate - Company Directors Cour...,,,,,,,...,,,,0.0,,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16266,broussarded,0.0,"[{'title': 'Co-founder and Managing Director',...",[{'degree': 'Business and Climate Change: Towa...,,,,,,,...,,,4.802574,1.0,8.717435,9.364364,8.359914,1.0,2.0,2.0
5836,amit-pilowsky-2478281,0.0,"[{'title': 'Boad member', 'company_name': 'Fun...","[{'degree': 'MBA, Business', 'institution': 'I...",18.518519,10.582011,2.0,18.518519,10.582011,2.0,...,False,False,,0.0,8.216433,,6.875049,1.0,0.0,1.0
11286,vasil-karpitski-29142619,0.0,[{'title': 'Sales and Business Development Con...,"[{'degree': '', 'institution': 'Belarusian Sta...",,,,,,,...,,,,0.0,9.432198,9.898232,9.867758,3.0,3.0,3.0
8354,steven-wasserman-10084b63,0.0,"[{'title': 'Vice Chairman', 'company_name': 'R...","[{'degree': '', 'institution': 'Chair- YPO - Y...",,,,,,,...,False,False,,0.0,5.330661,6.821822,6.443358,1.0,2.0,3.0


In [197]:
experience_education_df

Unnamed: 0,username,Success_after_5_years,experience,education,company_5_success,company_5_failure,company_5_amount,company_10_success,company_10_failure,company_10_amount,...,worked_in_vc,worked_in_consulting,university_prestige,match_count,company_prestige_fortune,company_prestige_forbes,company_prestige_5_10,match_count_fortune,match_count_forbes,match_count_5_10
5499,jerry-nihen-18640a2,0.0,"[{'title': 'Cloud Native Architect', 'company_...",[{'degree': 'Coursework in Electronics & Mathe...,32.0,4.0,2.0,32.0,4.0,2.0,...,False,False,,0.0,,,,0.0,0.0,0.0
1816,kousuke,0.0,"[{'title': 'Chief Product Officer', 'company_n...",[],0.0,0.0,1.0,0.0,0.0,1.0,...,False,False,,0.0,3.186373,,5.727153,1.0,0.0,1.0
16801,eamendez01,0.0,"[{'title': 'Director', 'company_name': 'Genesi...","[{'degree': 'MBA, Finance, Accounting, and Eco...",,,,,,,...,,,,0.0,7.264529,7.762763,8.181007,2.0,3.0,3.0
21312,giovannibarillari,0.0,"[{'title': 'Site Reliability Engineer', 'compa...","[{'degree': 'Bachelor of Science (BS), Physics...",,,,,,,...,,,,0.0,,,,0.0,0.0,0.0
20024,arbel-freiman,0.0,"[{'title': 'Senior Backend Developer', 'compan...","[{'degree': 'Bachelor of Science (B.Sc.), Comp...",,,,,,,...,,,6.011738,1.0,,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13185,philippemnoel,-1.0,"[{'title': 'Co-Founder, CEO', 'company_name': ...","[{'degree': 'Bachelor of Arts - BA, Cum Laude ...",,,,,,,...,,,,0.0,9.759519,9.939940,9.951974,1.0,1.0,1.0
9013,dixondoll,0.0,"[{'title': 'Investor/Board Member', 'company_n...","[{'degree': 'Master of Science (MS), Electrica...",,,,,,,...,False,False,,0.0,8.777555,7.207207,7.874495,1.0,2.0,2.0
11678,richard-medal-p-eng-0999375a,0.0,"[{'title': 'Director of Operations', 'company_...","[{'degree': 'BASc, Electrical Engineering', 'i...",,,,,,,...,False,False,,0.0,,,,0.0,0.0,0.0
18740,nidhi-sachdeva-toronto,0.0,"[{'title': 'Sessional Lecturer', 'company_name...","[{'degree': 'Doctor of Philosophy - PhD, Educa...",,,,,,,...,,,4.809568,1.0,6.232465,4.682182,7.007862,1.0,2.0,2.0


In [202]:
test_experience_education_df.to_csv(r"C:\Users\Benja\Downloads\test_experience_education_df.csv")
train_model_experience_education_df.to_csv(r"C:\Users\Benja\Downloads\train_experience_education_df.csv")


In [199]:
merged_df = pd.merge(experience_education_df, train_model_experience_education_df, on='username', how='inner') 

In [200]:
train_model_experience_education_df

Unnamed: 0,username,Success_after_5_years,experience,education,company_5_success,company_5_failure,company_5_amount,company_10_success,company_10_failure,company_10_amount,...,worked_in_vc,worked_in_consulting,university_prestige,match_count,company_prestige_fortune,company_prestige_forbes,company_prestige_5_10,match_count_fortune,match_count_forbes,match_count_5_10
1761,johnmcnicolkc,0.0,"[{'title': 'Founder and Director', 'company_na...","[{'degree': 'Post Graduate Diploma, Accounting...",0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,,0.0,,7.662663,6.748675,0.0,1.0,1.0
2292,michaelliddell11,0.0,"[{'title': 'Managing Partner', 'company_name':...","[{'degree': 'Bachelor of Science - BS, Electri...",0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,,0.0,5.671343,5.467968,6.607243,2.0,4.0,5.0
721,marceloeichelberger,0.0,"[{'title': 'Data Engineer', 'company_name': 'P...","[{'degree': 'Bachelor of Information Sciences,...",0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,,0.0,,4.259259,5.971994,0.0,1.0,1.0
21018,robinleeuwerke,0.0,[{'title': 'Senior Director of Software Engine...,"[{'degree': 'BSc, Computer Science', 'institut...",,,,,,,...,,,,0.0,,3.871014,5.883393,0.0,7.0,7.0
13333,c-tony-liu-1a4989104,-1.0,"[{'title': 'Head of Platform & co-founder', 'c...","[{'degree': 'Postgraduate Degree, Postdoctoral...",,,,,,,...,,,,0.0,,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21236,yogesh-vishnoi-9917051bb,0.0,"[{'title': 'Penetration Tester', 'company_name...","[{'degree': ""Bachelor's degree, International ...",,,,,,,...,,,,0.0,,,,0.0,0.0,0.0
20881,saurabhshinde96,0.0,[{'title': 'Penetration Tester / Cyber Securit...,"[{'degree': 'Master of Information Technology,...",,,,,,,...,,,,0.0,,4.326827,5.987413,0.0,2.0,2.0
20543,robertobendana,0.0,"[{'title': 'Owner', 'company_name': 'Bendana C...",[{'degree': 'Certification in Executive & Team...,,,,,,,...,,,,0.0,2.264529,4.599600,5.783222,1.0,1.0,2.0
18864,tracy-bantegui-sweden-93455b48,0.0,"[{'title': 'Sales Director', 'company_name': '...","[{'degree': 'Dual B.S., Biology, Chemistry', '...",,,,,,,...,,,,0.0,,,,0.0,0.0,0.0


In [201]:
merged_df

Unnamed: 0,username,Success_after_5_years_x,experience_x,education_x,company_5_success_x,company_5_failure_x,company_5_amount_x,company_10_success_x,company_10_failure_x,company_10_amount_x,...,worked_in_vc_y,worked_in_consulting_y,university_prestige_y,match_count_y,company_prestige_fortune_y,company_prestige_forbes_y,company_prestige_5_10_y,match_count_fortune_y,match_count_forbes_y,match_count_5_10_y
0,1tonyaskew,0.0,"[{'title': 'Founder Partner', 'company_name': ...","[{'degree': 'Bachelor of Science (BSc), Physic...",,,,,,,...,False,False,4.294894,1.0,,8.513514,6.942845,0.0,1.0,1.0
1,1tonyaskew,0.0,"[{'title': 'Founder Partner', 'company_name': ...","[{'degree': 'Bachelor of Science (BSc), Physic...",,,,,,,...,False,False,4.294894,1.0,,8.513514,6.942845,0.0,1.0,1.0
2,amyakruse2020,0.0,[{'title': 'General Partner & Chief Investment...,"[{'degree': 'Ph.D., Neuroscience', 'institutio...",0.000000,0.000000,1.0,0.000000,0.000000,0.0,...,False,False,,0.0,,4.294294,5.979989,0.0,1.0,1.0
3,james-e-mccann,0.0,"[{'title': 'Founder & CEO', 'company_name': 'F...","[{'degree': 'Management science - UMIST, Busin...",0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,True,False,,0.0,,8.513514,6.942845,0.0,1.0,1.0
4,enkebashllari,0.0,"[{'title': 'Founder and Managing Director', 'c...",[{'degree': 'Master of Business Administration...,18.518519,10.582011,1.0,18.518519,10.582011,1.0,...,,,,0.0,,5.095095,6.162738,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,rajurishi,0.0,"[{'title': 'General Partner', 'company_name': ...","[{'degree': 'MS, Materials Science & Enginerri...",,,,,,,...,False,False,,0.0,8.216433,,6.875049,1.0,0.0,1.0
368,dane-mcdonald-2ba26850,0.0,"[{'title': 'Founder/CEO', 'company_name': 'Out...",[{'degree': 'Master of Business Administration...,4.316547,11.510791,1.0,4.316547,11.510791,1.0,...,False,True,,0.0,0.260521,2.177177,5.278150,1.0,1.0,2.0
369,dane-mcdonald-2ba26850,0.0,"[{'title': 'Founder/CEO', 'company_name': 'Out...",[{'degree': 'Master of Business Administration...,4.316547,11.510791,1.0,4.316547,11.510791,1.0,...,False,False,,0.0,0.260521,2.177177,5.278150,1.0,1.0,2.0
370,larrykaplan1,0.0,"[{'title': 'CEO & Principal', 'company_name': ...","[{'degree': 'J.D., Law', 'institution': 'Unive...",4.316547,11.510791,1.0,4.316547,11.510791,1.0,...,False,False,,0.0,,,,0.0,0.0,0.0


### 2.2 Data preparation for Text Feautre Extraction

In [None]:
# Train Data from first Split
train_combined_text_data = train_combined_data[['username', 'cleaned_posts', 'cleaned_comments', 'cleaned_reactions', 'about', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Test Data from first Split
test_combined_text_data = test_combined_data[['username', 'cleaned_posts', 'cleaned_comments', 'cleaned_reactions', 'about', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# First Train Data from the second split for supervised feature creation
train_feature_combined_text_data = train_feature_combined_data[['username', 'cleaned_posts', 'cleaned_comments', 'cleaned_reactions', 'about', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Second Train Data from the second split for the final model including the features with supervised creation
train_final_prediction_combined_text_data = train_final_prediction_combined_data[['username', 'cleaned_posts', 'cleaned_comments', 'cleaned_reactions', 'about', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]


In [56]:
# Function to ensure values are converted to list format
def to_list(x):
    """
    Converts input values into a list format.

    Parameters:
    x (any): Input value that can be a string, list, or other types.

    Returns:
    list: A list representation of the input value. Returns an empty list for invalid inputs.
    """
    if isinstance(x, str):  # If the value is a string
        try:
            return ast.literal_eval(x)  # Safely evaluate strings like "[item1, item2]"
        except (ValueError, SyntaxError):  # Handle invalid string formats
            return []  # Return an empty list if evaluation fails
    elif isinstance(x, list):  # If the value is already a list
        return x
    else:  # For other types (e.g., NaN), return an empty list
        return []

# Apply the `to_list` function to the relevant columns in each DataFrame
# Process 'cleaned_reactions'
train_combined_text_data['cleaned_reactions'] = train_combined_text_data['cleaned_reactions'].apply(to_list)
test_combined_text_data['cleaned_reactions'] = test_combined_text_data['cleaned_reactions'].apply(to_list)
train_feature_combined_text_data['cleaned_reactions'] = train_feature_combined_text_data['cleaned_reactions'].apply(to_list)
train_final_prediction_combined_text_data['cleaned_reactions'] = train_final_prediction_combined_text_data['cleaned_reactions'].apply(to_list)

# Process 'cleaned_posts'
train_combined_text_data['cleaned_posts'] = train_combined_text_data['cleaned_posts'].apply(to_list)
test_combined_text_data['cleaned_posts'] = test_combined_text_data['cleaned_posts'].apply(to_list)
train_feature_combined_text_data['cleaned_posts'] = train_feature_combined_text_data['cleaned_posts'].apply(to_list)
train_final_prediction_combined_text_data['cleaned_posts'] = train_final_prediction_combined_text_data['cleaned_posts'].apply(to_list)

# Process 'cleaned_comments'
train_combined_text_data['cleaned_comments'] = train_combined_text_data['cleaned_comments'].apply(to_list)
test_combined_text_data['cleaned_comments'] = test_combined_text_data['cleaned_comments'].apply(to_list)
train_feature_combined_text_data['cleaned_comments'] = train_feature_combined_text_data['cleaned_comments'].apply(to_list)
train_final_prediction_combined_text_data['cleaned_comments'] = train_final_prediction_combined_text_data['cleaned_comments'].apply(to_list)

# Verify changes by displaying a sample from one DataFrame
print(train_combined_text_data[['cleaned_reactions', 'cleaned_posts', 'cleaned_comments']].head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_combined_text_data['cleaned_reactions'] = train_combined_text_data['cleaned_reactions'].apply(to_list)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_combined_text_data['cleaned_reactions'] = test_combined_text_data['cleaned_reactions'].apply(to_list)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

In [57]:
df = test_combined_text_data[test_combined_text_data['Success_after_5_years']==-1]
df[df['cleaned_reactions'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]


Unnamed: 0,username,cleaned_posts,cleaned_comments,cleaned_reactions,about,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
15814,laura-godfrey-902b811,"[I was lucky enough to get an advance copy, an...","[Amazing!, “How to be awesome” by Becky Freema...","[hashtag\n#RBTC's, I'm not optimistic that the...",I love building businesses. And I really love ...,-1,—,-1
15278,brett-danick-b1008730,[Let's go! See ya in October SF!\nbasement.stu...,[I absolutely loved getting this notification....,"[It's a big launch day for us at Knock, extend...",,-1,1,-1
13454,nipulpatel16,"[This week, we celebrate our 10th anniversary🎉...","[I know that logo on the hoodie!, Nick Patel, ...",[What a day! Congrats to my dear friend Kathy ...,Serent Capital invests in growing businesses w...,-1,3,-1
15112,mrtrickster,[],[Sounds quite logical and realistic. Combined ...,[Building an AI prototype every week isn't eas...,With 15 years of experience in the tech indust...,-1,4,-1
15276,ajit-deora-0042243,"[Glover Lawrence\nGlover Lawrence, I’m happy t...",[Congrats to the Moara team on their launch. B...,"[John Frechette, Yifei Xu and I are incredibly...",,-1,2,-1
...,...,...,...,...,...,...,...,...
13307,nicolechiu,[DreamCatchers is hiring! \n\nWhen I stepped i...,[So proud of the grit that you are demonstrati...,"[I’m going to, ""...no amount of community enga...","Prior to joining Google, Nicole was Co-Founder...",-1,2,-1
14676,rasyadiabdoellah,[I’m happy to share that I’m starting a new po...,"[Congrats Ras! 👏🏻👏🏻, 🙌🙌, Congrats Ras! Hope yo...",[I'm happy to share that I graduated with my B...,I'm a front-end developer and multidisciplinar...,-1,1,-1
14804,laurenprestifilippo,[So proud of the Uber Eats team winning 1st pl...,[],[Thrilled to share that I am starting a new ch...,,-1,1,-1
13914,eshaun,[Friends — How do you prefer to interact with ...,"[Congratulations, Maranda!!! 🎉, Congrats Percy...",[Want to stand out in Customer Success?\n\nGro...,Customer Success leader with over 15 years of ...,-1,1,-1


In [58]:
# Check the data type of 'cleaned_reactions' for rows with Success_after_5_years == -1
df = train_combined_text_data[train_combined_text_data['Success_after_5_years'] == -1]
print(df['cleaned_reactions'].head())  # Inspect sample values
print(df['cleaned_reactions'].apply(type).value_counts())  # Check data types


14798    [Exciting to move to the Digital heart of the ...
14399    [I'm delighted to share that I've joined Intui...
14053    [🚀 What a fantastic day at the Agentforce Worl...
13538    [Onwards to new Adventures! I’m happy to share...
13789    [Curve expands Asia operations as indie sector...
Name: cleaned_reactions, dtype: object
cleaned_reactions
<class 'list'>    994
Name: count, dtype: int64


#### Reaction Text Analysis

In [59]:
# Reaction Train data
train_reactions_data = train_combined_text_data[['username', 'cleaned_reactions', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Drop rows where 'cleaned_reactions' is an empty list
train_reactions_data = train_reactions_data[train_reactions_data['cleaned_reactions'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]
print(train_reactions_data[train_reactions_data['Success_after_5_years']==0][['cleaned_reactions', 'Success_after_5_years']])

# Explode the 'cleaned_reactions' column so each list item becomes a separate row
train_reactions_data = train_reactions_data.explode('cleaned_reactions')

# Ensure the 'cleaned_reactions' column is converted to strings (not lists)
train_reactions_data['cleaned_reactions'] = train_reactions_data['cleaned_reactions'].astype(str)

#Reaction Test data
test_reactions_data = test_combined_text_data[['username', 'cleaned_reactions', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Drop rows where 'cleaned_reactions' is an empty list
test_reactions_data = test_reactions_data[test_reactions_data['cleaned_reactions'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]

# Explode the 'cleaned_reactions' column so each list item becomes a separate row
test_reactions_data = test_reactions_data.explode('cleaned_reactions')

# Ensure the 'cleaned_reactions' column is converted to strings (not lists)
test_reactions_data['cleaned_reactions'] = test_reactions_data['cleaned_reactions'].astype(str)

reactions_df = pd.concat([train_reactions_data, test_reactions_data], ignore_index=True)
reactions_df


# Additional Split for Supervised Feature Training
# Reaction Train data
train_feature_reactions_data = train_feature_combined_text_data[['username', 'cleaned_reactions', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Drop rows where 'cleaned_reactions' is an empty list
train_feature_reactions_data = train_feature_reactions_data[train_feature_reactions_data['cleaned_reactions'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]

# Explode the 'cleaned_reactions' column so each list item becomes a separate row
train_feature_reactions_data = train_feature_reactions_data.explode('cleaned_reactions')

# Ensure the 'cleaned_reactions' column is converted to strings (not lists)
train_feature_reactions_data['cleaned_reactions'] = train_feature_reactions_data['cleaned_reactions'].astype(str)

                                       cleaned_reactions  \
6356   [You have (generally) two paths. The more you ...   
16301  [I have some super exciting news: Come Novembe...   
8835   [Échale un vistazo a uno de nuestros últimos t...   
19690  [In Vegas for, Imagine over 10,000 people in o...   
21602  [Feeling Grateful and Honored!\nI'm excited to...   
...                                                  ...   
16023  [What a Subarachnoid Hemorrhage (SAH) taught m...   
11363  [Wintermute is seeking to hire a talented Trad...   
14423  [2 years ago I had the idea for OpenSauced tha...   
11284  [Hi we are growing! We have a position for an ...   
21575  [45th done right! \n\nA heartfelt thank you to...   

       Success_after_5_years  
6356                       0  
16301                      0  
8835                       0  
19690                      0  
21602                      0  
...                      ...  
16023                      0  
11363                      0  


In [60]:
test_reactions_data[["username", "Success_after_5_years"]].groupby("Success_after_5_years").count()

Unnamed: 0_level_0,username
Success_after_5_years,Unnamed: 1_level_1
-1,3285
0,34775
1,2170


In [61]:
train_reactions_data[["username", "Success_after_5_years"]].groupby("Success_after_5_years").count()

Unnamed: 0_level_0,username
Success_after_5_years,Unnamed: 1_level_1
-1,7082
0,82332
1,5227


In [62]:
train_feature_reactions_data[["username", "Success_after_5_years"]].groupby("Success_after_5_years").count()

Unnamed: 0_level_0,username
Success_after_5_years,Unnamed: 1_level_1
-1,3765
0,41269
1,2437


In [63]:
train_reactions_data[train_reactions_data['Success_after_5_years']==-1][['cleaned_reactions', 'Success_after_5_years']]

Unnamed: 0,cleaned_reactions,Success_after_5_years
14798,Exciting to move to the Digital heart of the M...,-1
14798,D.A. Davidson & Co. announced today that it ha...,-1
14798,I'm thrilled to be starting a new position as ...,-1
14798,Happy Wednesday!\nJada Toys engaged Scrambled ...,-1
14798,I'm very excited to announce the closing of th...,-1
...,...,...
15787,There’s never been an easier time in history t...,-1
15787,This is such great news! Can't wait to start o...,-1
15787,There's a lot of debate on the ROI of research...,-1
15787,"""There are two essential dimensions of leaders...",-1


In [64]:
train_reactions_data.to_csv(r"C:\Users\Benja\Downloads\train_reactions_data.csv")
test_reactions_data.to_csv(r"C:\Users\Benja\Downloads\test_reactions_data.csv")
train_feature_reactions_data.to_csv(r"C:\Users\Benja\Downloads\train_feature_reactions_data.csv")

In [65]:
train_reactions_data

Unnamed: 0,username,cleaned_reactions,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
6356,josh-rickard,You have (generally) two paths. The more you l...,0,0,0
6356,josh-rickard,We're thrilled to present the Swimlane Anchor ...,0,0,0
6356,josh-rickard,We need more cybersecurity startups wiling to ...,0,0,0
6356,josh-rickard,"Hey everyone, the threat detection team at App...",0,0,0
6356,josh-rickard,"Yesterday, my tenure at Cofense ended unexpect...",0,0,0
...,...,...,...,...,...
21575,amitsri1008,💥IT'S THAT TIME.....the 2024 GSA Award Nominee...,0,1,1
21575,amitsri1008,I had a privilege of attending OPM62 batch Har...,0,1,1
21575,amitsri1008,On his 90th birth anniversary (first since his...,0,1,1
21575,amitsri1008,It has been very rewarding to be early stage f...,0,1,1


In [66]:
test_reactions_data

Unnamed: 0,username,cleaned_reactions,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
17116,joseph-chen-cmrp-691bb510,👏🏼 I’d like to give a big shoot out to the ent...,0,0,0
17116,joseph-chen-cmrp-691bb510,Incredible work by the whole team!\nLesley Haw...,0,0,0
17116,joseph-chen-cmrp-691bb510,Canada's Children's Hospital Foundations is HI...,0,0,0
17116,joseph-chen-cmrp-691bb510,I’m happy to share that I’m starting a new pos...,0,0,0
17116,joseph-chen-cmrp-691bb510,"As a coordinator at Sephora Gold, I’m grateful...",0,0,0
...,...,...,...,...,...
8354,steven-wasserman-10084b63,"What an exciting day it was, to celebrate our ...",0,1,0
8354,steven-wasserman-10084b63,In the immortal words of Marv Albért… \nYES AN...,0,1,0
8354,steven-wasserman-10084b63,"Something has authority in your life—work, spo...",0,1,0
8354,steven-wasserman-10084b63,"Planning a wedding is a big deal, but so is pl...",0,1,0


In [67]:
train_feature_reactions_data

Unnamed: 0,username,cleaned_reactions,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
1816,kousuke,先日、1年半務めたELSAのカントリーマネージャーを引退する運びとなりました。（引き続き、同...,0,1,0
1816,kousuke,Dear Network - \n\nI’m delighted to share that...,0,1,0
1816,kousuke,【経産省シリコンバレー拠点 “Japan Innovation Campus” コワーキング...,0,1,0
1816,kousuke,COVER Corporation is one of the world's larges...,0,1,0
1816,kousuke,COVER Corporation Announces hololive Meet 2024...,0,1,0
...,...,...,...,...,...
18190,amanda-negri-69b53763,More hardware for the hard work....and we ain'...,0,0,1
18190,amanda-negri-69b53763,Breaking news… Today we’re welcoming a new VP ...,0,0,1
18190,amanda-negri-69b53763,It was an incredible privilege to work with yo...,0,0,1
18190,amanda-negri-69b53763,Today’s office.,0,0,1


#### Posts Text Analysis

In [68]:
#train_combined_text_data['Success_after_5_years'] = train_combined_text_data['Success_after_5_years'].astype(int)

In [70]:
# Check the type of the first element in the column
print(type(test_combined_text_data[test_combined_text_data['Success_after_5_years'] ==-1]['cleaned_reactions'].iloc[1]))


<class 'list'>


In [71]:
# Reaction Train data
train_posts_data = train_combined_text_data[['username', 'cleaned_posts', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Drop rows where 'cleaned_reactions' is an empty list
#print(train_posts_data[train_posts_data['Success_after_5_years']==-1][['Success_after_5_years', 'cleaned_posts']])
train_posts_data = train_posts_data[train_posts_data['cleaned_posts'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]
#print(train_posts_data[train_posts_data['Success_after_5_years']==-1][['Success_after_5_years', 'cleaned_posts']])
#print(train_posts_data)
# Explode the 'cleaned_reactions' column so each list item becomes a separate row
train_posts_data = train_posts_data.explode('cleaned_posts')

# Ensure the 'cleaned_reactions' column is converted to strings (not lists)
train_posts_data['cleaned_posts'] = train_posts_data['cleaned_posts'].astype(str)

#Reaction Test data
test_posts_data = test_combined_text_data[['username', 'cleaned_posts', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
print(test_posts_data[test_posts_data['Success_after_5_years']==-1])

print(type(test_posts_data[test_posts_data['Success_after_5_years'] ==-1]['cleaned_posts'].iloc[1]))
# Drop rows where 'cleaned_posts' is an empty list
test_posts_data = test_posts_data[test_posts_data['cleaned_posts'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]
print(test_posts_data[test_posts_data['Success_after_5_years']==-1])

# Explode the 'cleaned_posts' column so each list item becomes a separate row
test_posts_data = test_posts_data.explode('cleaned_posts')
#print(test_posts_data[test_posts_data['Success_after_5_years']==0][['cleaned_reactions', 'Success_after_5_years']])

# Ensure the 'cleaned_posts' column is converted to strings (not lists)
test_posts_data['cleaned_posts'] = test_posts_data['cleaned_posts'].astype(str)

posts_df = pd.concat([train_posts_data, test_posts_data], ignore_index=True)
posts_df


# Additional Split for Supervised Feature Training
# Reaction Train data
train_feature_posts_data = train_feature_combined_text_data[['username', 'cleaned_posts', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Drop rows where 'cleaned_posts' is an empty list
train_feature_posts_data = train_feature_posts_data[train_feature_posts_data['cleaned_posts'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]

# Explode the 'cleaned_posts' column so each list item becomes a separate row
train_feature_posts_data = train_feature_posts_data.explode('cleaned_posts')

# Ensure the 'cleaned_posts' column is converted to strings (not lists)
train_feature_posts_data['cleaned_posts'] = train_feature_posts_data['cleaned_posts'].astype(str)

                         username  \
15700        melissajamesmorrison   
15814       laura-godfrey-902b811   
15278       brett-danick-b1008730   
15900  andrea-guendelman-78b32643   
13454                nipulpatel16   
...                           ...   
13580                   acarlsarv   
14804         laurenprestifilippo   
13914                      eshaun   
13552                feigelbinder   
14067                     kenray4   

                                           cleaned_posts  \
15700  [Recent coverage on the expansion of Family Of...   
15814  [I was lucky enough to get an advance copy, an...   
15278  [Let's go! See ya in October SF!\nbasement.stu...   
15900  [I am going to say something to female founder...   
13454  [This week, we celebrate our 10th anniversary🎉...   
...                                                  ...   
13580  [Ta chansen att söka ett spännande och utmanan...   
14804  [So proud of the Uber Eats team winning 1st pl...   
13914  [Friends

In [72]:
# Filter rows where 'cleaned_posts' is not empty and 'Success_after_5_years' is -1
filtered_data = train_combined_text_data[
    train_combined_text_data['Success_after_5_years'] == -1
].copy()

# Ensure 'cleaned_posts' is treated as a list and check if it's not empty
filtered_data = filtered_data[
    filtered_data['cleaned_posts'].apply(lambda x: isinstance(x, list) and len(x) > 0)
]

# Display the filtered data
filtered_data


Unnamed: 0,username,cleaned_posts,cleaned_comments,cleaned_reactions,about,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
14798,neal-applefeld-3175396,[Ready for pumpkin spice season! 🍂🎃 I just lit...,"[Megan. I’m so sorry for your loss., Geralyn. ...",[Exciting to move to the Digital heart of the ...,20 years of VP and C-level experience leading ...,-1,5,-1
14399,shannon-chambers,"[Fishbowl\nFishbowl\n17,912 followers\n17,912 ...","[Exciting stuff! They're lucky to have you., W...",[I'm delighted to share that I've joined Intui...,,-1,1,-1
14053,nagim-zamarialai-a8b26b95,[Great to be in San Diego! Energised for Sales...,"[Enjoy the event Wim de Jong Eric Magnuson, Aw...",[🚀 What a fantastic day at the Agentforce Worl...,Helping high-growth businesses transform the w...,-1,1,-1
13789,philipp-seifried,"[After 4 years of work, ""Dungeons of Hinterber...",[Etienne Rouzet-Davies 😷 Devcom/Gamescom Thank...,[Curve expands Asia operations as indie sector...,,-1,1,-1
15382,craigsturgis,[Happy annual planning season to all of you cu...,[Helped me find a polling place nearby with no...,[],I learned to write BASIC from a library book 1...,-1,1,-1
...,...,...,...,...,...,...,...,...
14820,nicolasgrasset,[We’re just getting started!\n\nSo lucky to be...,[],[Just jumped off the Shopify Q3 Earnings call....,"(I only connect with persons I have met, but f...",-1,3,-1
13773,anandrajaraman,[The last watch party was so much fun that we'...,"[Congrats Raghu! 🎉, Congrats Yichen, Congrats ...",[I’m excited to share that I’ve joined Andrees...,,-1,—,-1
15265,eddytse,[Filament Limited\nFilament Limited\n889 follo...,"[Send this book to me! Haha, Hi Matthew, thank...",[After months of meticulous preparation and fi...,Experienced Producer with a demonstrated histo...,-1,1,-1
15787,darin-suthapong-03b9739,[Hi all! Hato Hub is looking for an APM (and w...,"[Excited for your next move!, Wow! This is sup...",[📣 I'm excited to announce that after 6 years ...,,-1,1,-1


In [73]:
test_combined_text_data[["username", "Success_after_5_years"]].groupby("Success_after_5_years").count()

Unnamed: 0_level_0,username
Success_after_5_years,Unnamed: 1_level_1
-1,456
0,5802
1,306


In [74]:
test_posts_data[["username", "Success_after_5_years"]].groupby("Success_after_5_years").count()

Unnamed: 0_level_0,username
Success_after_5_years,Unnamed: 1_level_1
-1,3674
0,41343
1,2228


In [75]:
test_reactions_data[["username", "Success_after_5_years"]].groupby("Success_after_5_years").count()


Unnamed: 0_level_0,username
Success_after_5_years,Unnamed: 1_level_1
-1,3285
0,34775
1,2170


In [76]:
train_posts_data[["username", "Success_after_5_years"]].groupby("Success_after_5_years").count()

Unnamed: 0_level_0,username
Success_after_5_years,Unnamed: 1_level_1
-1,7908
0,98645
1,5602


In [None]:
train_posts_data=pd.read_csv(r"C:\Users\Benja\Downloads\train_posts_data.csv")
test_posts_data=pd.read_csv(r"C:\Users\Benja\Downloads\test_posts_data.csv")
train_feature_posts_data=pd.read_csv(r"C:\Users\Benja\Downloads\train_feature_posts_data.csv")

In [77]:
test_posts_data

Unnamed: 0,username,cleaned_posts,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
17116,joseph-chen-cmrp-691bb510,"While building my career in CPG in Canada, Can...",0,0,0
17116,joseph-chen-cmrp-691bb510,We're proud to welcome Megan Harris to the Leo...,0,0,0
17116,joseph-chen-cmrp-691bb510,Are collaborations the future of marketing in ...,0,0,0
17116,joseph-chen-cmrp-691bb510,Seed oils are under fire. A few thoughts on po...,0,0,0
17116,joseph-chen-cmrp-691bb510,"Move over avocados, pomegranates, and acai; da...",0,0,0
...,...,...,...,...,...
19323,nicholasshevelyov,Innovation in the UAE continues! Congratulatio...,0,1,1
19323,nicholasshevelyov,I look forward to participating!\nCOFENSE\nCOF...,0,1,1
19323,nicholasshevelyov,I’m happy to share that I’m starting a new pos...,0,1,1
19323,nicholasshevelyov,While Security Exchange Gateways (SEGs) are cr...,0,1,1


In [78]:
train_feature_posts_data

Unnamed: 0,username,cleaned_posts,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
1816,kousuke,Peter Spangler :)\nEmmanuel Job\nEmmanuel Job,0,1,0
16801,eamendez01,Hi All! Despite the horrible weather in New Yo...,0,0,0
16801,eamendez01,Firing on all cycliners! \n\nAnd big congrats ...,0,0,0
16801,eamendez01,This team of phenomenal individuals put togeth...,0,0,0
16801,eamendez01,"As many of you know, I have become very involv...",0,0,0
...,...,...,...,...,...
18190,amanda-negri-69b53763,MediaCrossing Inc.\nMediaCrossing Inc.\n969 fo...,0,0,1
18190,amanda-negri-69b53763,MediaCrossing Inc.\nMediaCrossing Inc.\n969 fo...,0,0,1
18190,amanda-negri-69b53763,MediaCrossing Inc.\nMediaCrossing Inc.\n969 fo...,0,0,1
18190,amanda-negri-69b53763,Marcum Top 40 event!\nMediaCrossing Inc.\nMedi...,0,0,1


#### Comments Text Analysis

In [79]:
# Reaction Train data
train_comments_data = train_combined_text_data[['username', 'cleaned_comments', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Drop rows where 'cleaned_comments' is an empty list
train_comments_data = train_comments_data[train_comments_data['cleaned_comments'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]

# Explode the 'cleaned_comments' column so each list item becomes a separate row
train_comments_data = train_comments_data.explode('cleaned_comments')

# Ensure the 'cleaned_comments' column is converted to strings (not lists)
train_comments_data['cleaned_comments'] = train_comments_data['cleaned_comments'].astype(str)

#Reaction Test data
test_comments_data = test_combined_text_data[['username', 'cleaned_comments', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Drop rows where 'cleaned_comments' is an empty list
test_comments_data = test_comments_data[test_comments_data['cleaned_comments'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]

# Explode the 'cleaned_comments' column so each list item becomes a separate row
test_comments_data = test_comments_data.explode('cleaned_comments')

# Ensure the 'cleaned_comments' column is converted to strings (not lists)
test_comments_data['cleaned_comments'] = test_comments_data['cleaned_comments'].astype(str)

comments_df = pd.concat([train_comments_data, test_comments_data], ignore_index=True)
comments_df


# Additional Split for Supervised Feature Training
# Reaction Train data
train_feature_comments_data = train_feature_combined_text_data[['username', 'cleaned_comments', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Drop rows where 'cleaned_comments' is an empty list
train_feature_comments_data = train_feature_comments_data[train_feature_comments_data['cleaned_comments'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]

# Explode the 'cleaned_comments' column so each list item becomes a separate row
train_feature_comments_data = train_feature_comments_data.explode('cleaned_comments')

# Ensure the 'cleaned_comments' column is converted to strings (not lists)
train_feature_comments_data['cleaned_comments'] = train_feature_comments_data['cleaned_comments'].astype(str)

In [80]:
train_comments_data.to_csv(r"C:\Users\Benja\Downloads\train_comments_data.csv")
test_comments_data.to_csv(r"C:\Users\Benja\Downloads\test_comments_data.csv")
train_feature_comments_data.to_csv(r"C:\Users\Benja\Downloads\train_feature_comments_data.csv")

In [81]:
train_comments_data[["username", "Success_after_5_years"]].groupby("Success_after_5_years").count()

Unnamed: 0_level_0,username
Success_after_5_years,Unnamed: 1_level_1
-1,7012
0,90906
1,5296


In [82]:
test_comments_data

Unnamed: 0,username,cleaned_comments,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
17116,joseph-chen-cmrp-691bb510,Congratulations!!!!👏,0,0,0
17116,joseph-chen-cmrp-691bb510,Catherine Chen “CC” You have an amazing produc...,0,0,0
17116,joseph-chen-cmrp-691bb510,Excited to attend this season and the show!,0,0,0
17116,joseph-chen-cmrp-691bb510,Congratulations Inigo García Alcalde!!!,0,0,0
17116,joseph-chen-cmrp-691bb510,Very excited for you!!! Congratulations 🎉,0,0,0
...,...,...,...,...,...
19323,nicholasshevelyov,Nick Shevelyov you haven't aged a day!,0,1,1
19323,nicholasshevelyov,Greg Martin very kind of you. Still trying to ...,0,1,1
19323,nicholasshevelyov,Thanks Nick Shevelyov for inviting me to parti...,0,1,1
19323,nicholasshevelyov,Richard Seiersen I like the sound of security ...,0,1,1


In [83]:
train_feature_comments_data

Unnamed: 0,username,cleaned_comments,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
1816,kousuke,Congrats Mayur!!!,0,1,0
1816,kousuke,ここはめっちゃ大きいと思ってます。後はその中で中国のポジションは下がるだろうなとも思ってます...,0,1,0
1816,kousuke,あれ、行こうかと思ってて忘れてました。。。\nShow translation\nShow ...,0,1,0
1816,kousuke,これはびっくりしましたよね。。。。バイデンとホワイトハウスの意思疎通ができていないとは思うん...,0,1,0
1816,kousuke,アメリカに来るの？\nShow translation\nShow translation ...,0,1,0
...,...,...,...,...,...
18740,nidhi-sachdeva-toronto,Fantastic post Umes Shrestha \nMy kids do TKD ...,0,0,0
18740,nidhi-sachdeva-toronto,Umes Shrestha Yup!.,0,0,0
18740,nidhi-sachdeva-toronto,The final point -- thinking about motivation b...,0,0,0
18740,nidhi-sachdeva-toronto,Anand Krishnaswamy It's possible. We just like...,0,0,0


#### About Text Analysis

In [84]:
# Reaction Train data
train_about_data = train_combined_text_data[['username', 'about', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Drop rows where 'about' is an empty string
train_about_data = train_about_data[train_about_data['about'].apply(lambda x: x.strip() != "" if isinstance(x, str) else False)]

# Ensure the 'about' column is converted to strings (not lists)
train_about_data['about'] = train_about_data['about'].astype(str)

#Reaction Test data
test_about_data = test_combined_text_data[['username', 'about', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Drop rows where 'about' is an empty string
test_about_data = test_about_data[test_about_data['about'].apply(lambda x: x.strip() != "" if isinstance(x, str) else False)]

# Ensure the 'about' column is converted to strings (not lists)
test_about_data['about'] = test_about_data['about'].astype(str)

about_df = pd.concat([train_about_data, test_about_data], ignore_index=True)
about_df


# Additional Split for Supervised Feature Training
# Reaction Train data
train_feature_about_data = train_feature_combined_text_data[['username', 'about', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Success_until_now']]
# Drop rows where 'about' is an empty list
train_feature_about_data = train_feature_about_data[train_feature_about_data['about'].apply(lambda x: x.strip() != "" if isinstance(x, str) else False)]
# Explode the 'about' column so each list item becomes a separate row
train_feature_about_data = train_feature_about_data.explode('about')

# Ensure the 'about' column is converted to strings (not lists)
train_feature_about_data['about'] = train_feature_about_data['about'].astype(str)

In [85]:
train_about_data.to_csv(r"C:\Users\Benja\Downloads\train_about_data.csv")
test_about_data.to_csv(r"C:\Users\Benja\Downloads\test_about_data.csv")
train_feature_about_data.to_csv(r"C:\Users\Benja\Downloads\train_feature_about_data.csv")

In [86]:
train_about_data

Unnamed: 0,username,about,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
6356,josh-rickard,"I am a creator of things, who loves to automat...",0,0,0
9317,farshad-kazemian-,.................. WHAT IF EATING MEAT COULD H...,0,1,0
19690,kannanmuthukkaruppan,"I have primarily worked on databases, large sc...",0,2,0
21602,puja-wadhawan-02880a3,"Dynamic, resourceful and ethical Executive Ass...",0,1,0
14084,kadiru,"As of Sep 2024, slowly coming out of a long te...",0,2,-1
...,...,...,...,...,...
11964,cory-combs-2b08b868,Building the future of electric aviation,0,0,0
21575,amitsri1008,Engineer at heart who has managed teams and ta...,0,1,1
5390,adamsharkawy,Adam is founding and managing partner of Mater...,0,3,0
860,jaqqui-posthumus-3ba7b2123,Country Head and Executive Director of Digitai...,0,0,0


In [87]:
test_about_data

Unnamed: 0,username,about,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
17116,joseph-chen-cmrp-691bb510,Joseph has 18 years of experience working on t...,0,0,0
21025,dylan-lawhon,I am currently working as an Independent Secur...,0,2,0
3115,victor-i,"With over 9 years of experience in fintech, bl...",0,0,0
14919,davidstavens,I build world class technology that improves h...,1,3,1
15700,melissajamesmorrison,Melissa James BA LLB (Hons) GAICD is a Partner...,-1,—,-1
...,...,...,...,...,...
6056,traciebrack,I have the privilege of working with global in...,0,1,0
2928,adam-norris-09528b149,Building the worlds best electric scooter bran...,0,1,0
14359,ryandenehy,3x entrepreneur with two exits to public compa...,1,2,1
11286,vasil-karpitski-29142619,- Business Development Management and Sales ma...,0,1,0


In [88]:
train_feature_about_data

Unnamed: 0,username,about,Success_after_5_years,equity_rounds_raised_after_5_years,Success_until_now
5499,jerry-nihen-18640a2,With a robust foundation in pre-sales engineer...,0,1,1
16801,eamendez01,"Skilled in mergers and acquisitions, restructu...",0,0,0
11291,%E2%9A%A1%EF%B8%8Falex-harris-7652069,#1 Developer Tool of the Week on Product Hunt\...,0,1,0
1036,nitinrai1,"Entrepreneur, CEO, Executive, Angel Investor a...",0,1,0
8130,businesscybershield,As the Founder and CEO / CTO of Business Cyber...,0,1,0
...,...,...,...,...,...
18634,melika-imoru-38259330,"A dynamic and passionate, result driven produc...",0,0,0
21032,jordan-m-b89b65253,Some stuff about me; looking for the carry so ...,0,2,0
13185,philippemnoel,ParadeDB: https://github.com/paradedb/paradedb...,-1,4,-1
9013,dixondoll,"For more than 35 years, Dixon has influenced a...",0,1,1


In [89]:
train_feature_about_data[["Success_after_5_years", "username"]].groupby("Success_after_5_years").count()

Unnamed: 0_level_0,username
Success_after_5_years,Unnamed: 1_level_1
-1,404
0,5330
1,245
