# Extracting features out of Posts, Comments, and Reactions

In [1]:
import pandas as pd
import ast
import re

In [2]:
final_combined_df = pd.read_csv(r"..\Final_Company_Dataset\filtered_closed_data_df.csv")

In [3]:
final_combined_df["Success_after_5_years"].unique()

array([-1,  1,  0], dtype=int64)

In [4]:
# Columns to convert from strings to lists
columns_to_convert = ["posts", "comments", "reactions"]

for column in columns_to_convert:
    final_combined_df[column] = final_combined_df[column].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )

## 1.Extracting Numerical Features and extracting Posts, Comments, and Reactions Texts

### Ectracting features out of Posts

In [5]:
# Function to extract the number of reactions, comments, and reposts from a post
def extract_post_details(post):
    """
    Extracts key engagement metrics (reactions, comments, and reposts) from a given post string.

    Parameters:
    post (str): The string content of the post.

    Returns:
    tuple: A tuple containing three integers:
        - main_number: The total number of reactions (e.g., likes, engagements).
        - comments_number: The total number of comments on the post.
        - reposts_number: The total number of reposts or shares of the post.
    """
    
    # Extract the main engagement number (e.g., likes, reactions)
    # This matches a pattern where a number is surrounded by newline characters (\n)
    main_number_match = re.search(r"\n(\d+)\n", post)
    main_number = int(main_number_match.group(1)) if main_number_match else 0  # Default to 0 if no match is found

    # Extract the number of comments
    # This matches the number following the main engagement number, followed by "comments"
    comments_match = re.search(rf"{re.escape(str(main_number))}\n(\d+)\scomments", post, re.IGNORECASE)
    comments_number = int(comments_match.group(1)) if comments_match else 0  # Default to 0 if no match is found

    # Extract the number of reposts
    # This matches the number following "comments" and preceding "reposts"
    reposts_match = re.search(r"comments\n(\d+)\sreposts?", post, re.IGNORECASE)
    reposts_number = int(reposts_match.group(1)) if reposts_match else 0  # Default to 0 if no match is found

    # Return the extracted metrics as a tuple
    return main_number, comments_number, reposts_number


In [6]:
# Function to convert time strings to days
def time_to_days(time_str):
    """
    Converts time strings like '2 years', '5 days', etc., into their equivalent number of days.

    Parameters:
    time_str (str): A string representing the time duration.

    Returns:
    int or None: The equivalent number of days as an integer, or None if no valid time format is found.
    """
    time_str = time_str.strip().lower()
    if "year" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 365
    elif "month" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 30
    elif "week" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 7
    elif "day" in time_str:
        return int(re.search(r"\d+", time_str).group())
    elif "hour" in time_str:
        return 1
    else:
        return None  # If no valid time format is found

# Function to clean posts and extract time information
def clean_posts(posts):
    """
    Cleans post content, counts reposts, and extracts time information in days.

    Parameters:
    posts (list): A list of post strings to process.

    Returns:
    tuple: A tuple containing:
        - cleaned_posts (list): A list of cleaned post content.
        - repost_count (int): The number of reposted posts.
        - post_times (list): A list of posting times converted to days.
    """
    cleaned_posts = []
    repost_count = 0  # To track how many posts are reposts
    post_times = []  # To store time information in days

    for post in posts:
        # Check if the post is a repost
        if "reposted this" in post.lower():
            repost_count += 1
            continue  # Skip reposted posts

        # Extract the time string (value before "ago\nFollow")
        time_match = re.search(r"(\d+\s*(?:year|month|week|day)s?)\s*ago\nfollow\n", post, re.IGNORECASE)
        if time_match:
            time_in_days = time_to_days(time_match.group(1))
            post_times.append(time_in_days)
        else:
            post_times.append(None)  # If no time found, append None

        # Extract content starting after "Follow"
        match = re.search(r"follow\n(.*)", post, re.IGNORECASE | re.DOTALL)
        if match:
            content = match.group(1)

            # Stop at the first occurrence of specific markers or patterns
            content = re.split(r"(\n…more|\nActivate to view larger image|\nDetails|\nNABIS|• 3rd+|\nChris Nguyen)", content, flags=re.IGNORECASE)[0]

            # Remove trailing "…more" and clean up any unnecessary whitespace
            content = re.sub(r"…more", "", content).strip()

            # Append the cleaned content to the list
            cleaned_posts.append(content)

    return cleaned_posts, repost_count, post_times

# Function to process the DataFrame
def process_dataframe(df):
    """
    Processes a DataFrame to clean posts, extract engagement metrics, and add new columns.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing a "posts" column.

    Returns:
    pd.DataFrame: A new DataFrame with additional columns for cleaned data and metrics.
    """
    cleaned_data = []  # To store results for all rows
    engagements_posts = []
    comments_on_posts = []
    reposts_number_posts = []
    post_times_column = []

    for _, row in df.iterrows():
        posts = row["posts"]  # Access the list of posts in the "posts" column

        # Clean posts and count reposts
        cleaned_posts, repost_count, post_times = clean_posts(posts)

        # Extract engagement details
        engagements = []
        comments = []
        reposts = []
        for post in posts:
            main_number, comments_number, reposts_number = extract_post_details(post)
            engagements.append(main_number)
            comments.append(comments_number)
            reposts.append(reposts_number)

        engagements_posts.append(engagements)
        comments_on_posts.append(comments)
        reposts_number_posts.append(reposts)
        post_times_column.append(post_times)

        cleaned_data.append({
            "cleaned_posts": cleaned_posts,
            "num_posts": len(posts),
            "num_reposted_posts": repost_count
        })

    # Create new DataFrame with additional columns
    cleaned_df = pd.DataFrame(cleaned_data)
    cleaned_df["engagements_posts"] = engagements_posts
    cleaned_df["comments_on_posts"] = comments_on_posts
    cleaned_df["reposts_number_posts"] = reposts_number_posts
    cleaned_df["post_times"] = post_times_column

    # Combine the original DataFrame with the cleaned data
    return pd.concat([df, cleaned_df], axis=1)

# Assuming `final_combined_df` is your original DataFrame
final_combined_df = process_dataframe(final_combined_df)

# Display the updated DataFrame
final_combined_df


Unnamed: 0.1,Unnamed: 0,Organization Name,Industry Groups,username,about,followers,posts,comments,reactions,experience,...,equity_rounds_raised_after_5_years,Equity_raised_until_now,Success_until_now,cleaned_posts,num_posts,num_reposted_posts,engagements_posts,comments_on_posts,reposts_number_posts,post_times
0,0,Netagenda;https://www.crunchbase.com/organizat...,"Consumer Goods,Health Care",melihvatansever,,"5,093 followers",[Feed post number 1\nMelih Vatansever\nMelih V...,[Feed post number 1\nMelih Vatansever replied ...,[Feed post number 1\nMelih Vatansever likes th...,"[{'title': 'Partner', 'company_name': 'VNTRS G...",...,5,"$1,180,378",-1,[We are beyond proud of you! 🐥🤵🏻‍♂️👏🏼👏🏼\nkreis...,11,10,"[20, 25, 87, 18, 26, 21, 29, 102, 15, 101, 28]","[0, 0, 10, 0, 2, 0, 0, 13, 0, 2, 0]","[0, 0, 21, 0, 1, 0, 0, 14, 0, 11, 0]",[120]
1,1,Hideout;https://www.crunchbase.com/organizatio...,"Food and Beverage,Transportation",chefdaniellesobel,,72 followers,[Feed post number 1\ndanielle sobel\ndanielle ...,[],[],"[{'title': 'Executive Chef', 'company_name': '...",...,1,"$150,000",-1,[☺️\n+4],1,0,[2],[0],[0],[365]
2,3,"Scaphold, Inc. (acquired by Amazon);https://ww...","Apps,Mobile,Software",vcning,,"12,296 followers",[Feed post number 1\nVince C. Ning\nVince C. N...,[Feed post number 1\nVince C.’s profile photo\...,[Feed post number 1\nVince C.’s profile photo\...,"[{'title': 'Founder & CEO', 'company_name': 'N...",...,4,"$145,000",-1,[This has been a long time coming.\n\nI’m very...,15,4,"[108, 107, 113, 300, 258, 69, 144, 135, 114, 2...","[5, 12, 4, 34, 10, 7, 3, 17, 12, 24, 2, 0, 0, ...","[1, 9, 1, 0, 13, 2, 5, 7, 0, 1, 0, 0, 0, 6, 3]","[21, 90, 90, 90, 180, 210, 210, 240, 270, 270,..."
3,4,Engine eCommerce;https://www.crunchbase.com/or...,"Commerce and Shopping,Consumer Electronics,Har...",bpuryear,"As a resourceful and driven professional, I ha...","2,532 followers",[Feed post number 1\nBlake Puryear reposted th...,[Feed post number 1\nBlake Puryear commented o...,[],[{'title': 'Director of Product: Merchant & Sh...,...,3,"$4,500,000",-1,[Incredibly excited about this new product lin...,12,9,"[99, 12, 21, 204, 77, 64, 245, 127, 36, 147, 6...","[4, 0, 0, 118, 0, 6, 17, 8, 4, 8, 4, 11]","[6, 0, 0, 89, 0, 10, 27, 17, 1, 6, 0, 37]","[30, 60, 150]"
4,7,PredictifyMe;https://www.crunchbase.com/organi...,"Artificial Intelligence (AI),Data and Analytic...",zusmani,With over 10 years of experience in data scien...,"169,714 followers","[Feed post number 1\nZeeshan Usmani, Ph.D\nZee...","[Feed post number 1\nZeeshan Usmani, Ph.D repl...",[],"[{'title': 'Co-Founder & CEO', 'company_name':...",...,2,"$1,250,000",-1,[Nice meeting Ambassador of Pakistan to the U....,8,0,"[611, 171, 265, 39, 37, 88, 291, 283]","[16, 4, 3, 4, 20, 8, 11, 80]","[0, 0, 0, 0, 0, 2, 2, 2]","[2, 2, 3, 3, 4, 4, 4, 5]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2770,5546,Particle Code;https://www.crunchbase.com/organ...,"Consumer Electronics,Hardware,Software",galia-benartzi-1a0aa220,"I am a technology entrepreneur, currently work...","5,108 followers",[],[Feed post number 1\nGalia Benartzi commented ...,[Feed post number 1\nGalia Benartzi likes this...,"[{'title': 'Co-Founder, Business Development',...",...,1,"$3,000,000",1,[],0,0,[],[],[],[]
2771,5547,Particle Code;https://www.crunchbase.com/organ...,"Consumer Electronics,Hardware,Software",yotamshacham,Game developer who become a technology entrepr...,"1,564 followers",[Feed post number 1\nYotam Shacham\nYotam Shac...,[Feed post number 1\nYotam Shacham replied to ...,[Feed post number 1\nYotam Shacham likes this\...,"[{'title': 'Chief Technology Officer', 'compan...",...,1,"$3,000,000",1,[Hello everyone! \n\nI hope this message finds...,5,1,"[38, 55, 50, 9, 4]","[7, 2, 5, 0, 0]","[0, 10, 0, 0, 0]","[270, 1095, 2920, 2920]"
2772,5548,Particle Code;https://www.crunchbase.com/organ...,"Consumer Electronics,Hardware,Software",yudi-levi-4bb91911,,"4,533 followers",[Feed post number 1\nYudi Levi\nYudi Levi\n • ...,[],[Feed post number 1\nYudi Levi likes this\nJon...,"[{'title': 'Chief Architect', 'company_name': ...",...,1,"$3,000,000",1,[Job Opportunity at Bancor\nbancor.network\n17...,1,0,[17],[0],[0],[2555]
2773,5549,myDocket;https://www.crunchbase.com/organizati...,"Advertising,Data and Analytics,Information Tec...",jasonwesbecher,Twenty three-year career as metrics-driven sal...,"3,743 followers",[Feed post number 1\nJason Wesbecher\nJason We...,[Feed post number 1\nJason Wesbecher commented...,[Feed post number 1\nJason Wesbecher likes thi...,"[{'title': 'Chief Executive Officer', 'company...",...,2,"$4,600,000",-1,[Had a fantastic time at my 4th Big Exit event...,11,1,"[92, 18, 43, 52, 41, 33, 102, 34, 11, 49, 27]","[6, 0, 0, 2, 0, 0, 4, 4, 0, 0, 0]","[1, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]","[30, 60, 180, 210, 240, 270, 300, 300, 365, 365]"


In [7]:
print(final_combined_df["num_posts"].head())
print(final_combined_df["num_reposted_posts"].head())
print(final_combined_df.dtypes)


0    11
1     1
2    15
3    12
4     8
Name: num_posts, dtype: int64
0    10
1     0
2     4
3     9
4     0
Name: num_reposted_posts, dtype: int64
Unnamed: 0                             int64
Organization Name                     object
Industry Groups                       object
username                              object
about                                 object
followers                             object
posts                                 object
comments                              object
reactions                             object
experience                            object
education                             object
Founded Date                          object
equity_raised_in_5_years              object
Success_after_5_years                  int64
equity_rounds_raised_after_5_years    object
Equity_raised_until_now               object
Success_until_now                      int64
cleaned_posts                         object
num_posts                              in

### Extracting Features out of Comments

In [9]:
# Function to convert time strings to days
def time_to_days(time_str):
    """
    Converts time strings like '6mo', '2d' into equivalent days.

    Parameters:
    time_str (str): A time string indicating duration (e.g., '2d', '6mo').

    Returns:
    int or None: Number of days as an integer or None if the format is invalid.
    """
    if not time_str:
        return None
    time_map = {"y": 365, "mo": 30, "w": 7, "d": 1, "h": 1 / 24, "m": 1 / 1440, "s": 1 / 86400}
    match = re.match(r"(\d+)([a-z]+)", time_str.lower())  # Parse value and unit
    if match:
        value, unit = match.groups()
        return int(value) * time_map.get(unit, 0)
    return None

# Function to extract comments and their times from a post
def extract_comments(post, account_name):
    """
    Extracts comments and their associated times (in days) for a given account.

    Parameters:
    post (str): The post content.
    account_name (str): The account name to filter comments.

    Returns:
    tuple: A tuple containing:
        - comments (list): List of extracted comments.
        - times_in_days (list): List of times in days for the comments.
    """
    comments = []
    times_in_days = []

    # Define patterns for matching
    patterns = [
        rf"{re.escape(account_name)}(?:\s+\w+)?\n\s*•\s*3rd\+\n.*?\b(\d+[a-z]+)\b\n(.+?)(?:\n(?:Like|Reply|Collapse replies|Load more comments))",
        rf"{re.escape(account_name)}(?:,\s*[\w\.]+)*\s*\n\s*Author\n.*?\b(\d+[a-z]+)\b\n(.+?)(?:\n(?:Like|Reply|Collapse replies|Load more comments))",
    ]

    for pattern in patterns:
        matches = re.finditer(pattern, post, re.IGNORECASE | re.DOTALL)

        for match in matches:
            time_str = match.group(1)
            time_in_days = time_to_days(time_str)
            if time_in_days is not None:
                times_in_days.append(time_in_days)

            comment = match.group(2).strip()
            comments.append(comment)

    return comments, times_in_days

# Function to extract account name
def extract_account_name(post):
    """
    Extracts the account name from a post based on predefined patterns.

    Parameters:
    post (str): The post content.

    Returns:
    str or None: Extracted account name or None if not found.
    """
    match = re.search(r"s profile photo\n(.*?) commented on this", post, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    else:
        match = re.search(r"Feed post number \d+\n(.*?)(?:\sreplied|\scommented)", post, re.IGNORECASE)
        if match:
            return match.group(1).strip()
    return None

# Process a list of comments
def process_comments_list(comments_list):
    """
    Processes a list of comments, extracting cleaned comments and their times.

    Parameters:
    comments_list (list): List of posts containing comments.

    Returns:
    tuple: A tuple containing:
        - cleaned_comments (list): Cleaned comments.
        - days_ago_list (list): Times of comments in days.
        - account_name (str): Extracted account name.
    """
    cleaned_comments = []
    days_ago_list = []
    account_name = None

    for post in comments_list:
        if not post:
            continue

        # Extract account name if not already set
        if account_name is None:
            account_name = extract_account_name(post)

        # Extract comments if account name is valid
        if account_name:
            comments, times_in_days = extract_comments(post, account_name)
            cleaned_comments.extend(comments)
            days_ago_list.extend(times_in_days)
    return cleaned_comments, days_ago_list, account_name

# Wrapper function to process a row
def process_row(row):
    """
    Processes a row to extract comments, count them, and compute posting times.

    Parameters:
    row (pd.Series): A row from the DataFrame.

    Returns:
    pd.Series: Series with additional columns for cleaned comments and related metrics.
    """
    cleaned_comments, days_ago, account_name = process_comments(row)
    return pd.Series({
        "cleaned_comments": cleaned_comments,
        "num_comments": len(cleaned_comments),
        "comment_days_ago": days_ago,
        "account_name": account_name
    })

# Extract engagement details
def extract_post_details(post):
    """
    Extracts details about reactions, comments, and reposts from a post.

    Parameters:
    post (str): The post content.

    Returns:
    tuple: A tuple containing the number of reactions, comments, and reposts.
    """
    main_number_match = re.search(r"\n(\d+)\n", post)
    main_number = int(main_number_match.group(1)) if main_number_match else 0

    comments_match = re.search(r"(\d+)\scomments", post, re.IGNORECASE)
    comments_number = int(comments_match.group(1)) if comments_match else 0

    reposts_match = re.search(r"comments\n(\d+)\sreposts?", post, re.IGNORECASE)
    reposts_number = int(reposts_match.group(1)) if reposts_match else 0

    return main_number, comments_number, reposts_number

# Process post details
def process_post_details(row):
    """
    Processes a row to extract reactions, comments, and reposts details.

    Parameters:
    row (pd.Series): A row from the DataFrame.

    Returns:
    pd.Series: Series with engagement details for the post.
    """
    try:
        comments_list = row["comments"]
        if isinstance(comments_list, list):
            engagements, comments_on_comments, reposts_number = zip(
                *(extract_post_details(post) for post in comments_list)
            )
            return pd.Series({
                "engagements_comments": list(engagements),
                "comments_on_comments": list(comments_on_comments),
                "reposts_number_comments": list(reposts_number),
            })
        return pd.Series({
            "engagements_comments": [],
            "comments_on_comments": [],
            "reposts_number_comments": [],
        })
    except Exception:
        return pd.Series({
            "engagements_comments": [],
            "comments_on_comments": [],
            "reposts_number_comments": [],
        })

# Apply the processing functions
final_combined_df[["cleaned_comments", "num_comments", "comment_days_ago", "account_name"]] = final_combined_df.apply(process_row, axis=1)
final_combined_df[["engagements_comments", "comments_on_comments", "reposts_number_comments"]] = final_combined_df.apply(process_post_details, axis=1)

# Display the DataFrame to verify the result
final_combined_df


Unnamed: 0.1,Unnamed: 0,Organization Name,Industry Groups,username,about,followers,posts,comments,reactions,experience,...,comments_on_posts,reposts_number_posts,post_times,cleaned_comments,num_comments,comment_days_ago,account_name,engagements_comments,comments_on_comments,reposts_number_comments
0,0,Netagenda;https://www.crunchbase.com/organizat...,"Consumer Goods,Health Care",melihvatansever,,"5,093 followers",[Feed post number 1\nMelih Vatansever\nMelih V...,[Feed post number 1\nMelih Vatansever replied ...,[Feed post number 1\nMelih Vatansever likes th...,"[{'title': 'Partner', 'company_name': 'VNTRS G...",...,"[0, 0, 10, 0, 2, 0, 0, 13, 0, 2, 0]","[0, 0, 21, 0, 1, 0, 0, 14, 0, 11, 0]",[120],[Congrats Dimitri Nabatov - Lead gen has begun...,12,"[180, 180, 180, 180, 180, 180, 210, 210, 210, ...",Melih Vatansever,"[133, 133, 329, 18, 329, 115, 19, 19]","[32, 32, 11, 3, 62, 16, 5, 5]","[0, 0, 1, 1, 0, 1, 2, 2]"
1,1,Hideout;https://www.crunchbase.com/organizatio...,"Food and Beverage,Transportation",chefdaniellesobel,,72 followers,[Feed post number 1\ndanielle sobel\ndanielle ...,[],[],"[{'title': 'Executive Chef', 'company_name': '...",...,[0],[0],[365],[],0,[],,[],[],[]
2,3,"Scaphold, Inc. (acquired by Amazon);https://ww...","Apps,Mobile,Software",vcning,,"12,296 followers",[Feed post number 1\nVince C. Ning\nVince C. N...,[Feed post number 1\nVince C.’s profile photo\...,[Feed post number 1\nVince C.’s profile photo\...,"[{'title': 'Founder & CEO', 'company_name': 'N...",...,"[5, 12, 4, 34, 10, 7, 3, 17, 12, 24, 2, 0, 0, ...","[1, 9, 1, 0, 13, 2, 5, 7, 0, 1, 0, 0, 0, 6, 3]","[21, 90, 90, 90, 180, 210, 210, 240, 270, 270,...",[Congrats Cory! It’s great to see Eaze continu...,9,"[0.5416666666666666, 7, 30, 150, 210, 240, 240...",Vince C. Ning,"[229, 10, 233, 215, 68, 122, 71, 253, 82]","[54, 0, 63, 43, 17, 14, 13, 23, 25]","[8, 38, 4, 0, 0, 1, 0, 0, 0]"
3,4,Engine eCommerce;https://www.crunchbase.com/or...,"Commerce and Shopping,Consumer Electronics,Har...",bpuryear,"As a resourceful and driven professional, I ha...","2,532 followers",[Feed post number 1\nBlake Puryear reposted th...,[Feed post number 1\nBlake Puryear commented o...,[],[{'title': 'Director of Product: Merchant & Sh...,...,"[4, 0, 0, 118, 0, 6, 17, 8, 4, 8, 4, 11]","[6, 0, 0, 89, 0, 10, 27, 17, 1, 6, 0, 37]","[30, 60, 150]","[Relentless optimism -- often, stuff doesn't w...",9,"[28, 28, 28, 30, 60, 90, 90, 120, 120]",Blake Puryear,"[35, 45, 28, 43, 65, 43, 30, 236, 339]","[23, 14, 10, 14, 28, 3, 8, 53, 133]","[0, 11, 0, 3, 3, 1, 0, 0, 0]"
4,7,PredictifyMe;https://www.crunchbase.com/organi...,"Artificial Intelligence (AI),Data and Analytic...",zusmani,With over 10 years of experience in data scien...,"169,714 followers","[Feed post number 1\nZeeshan Usmani, Ph.D\nZee...","[Feed post number 1\nZeeshan Usmani, Ph.D repl...",[],"[{'title': 'Co-Founder & CEO', 'company_name':...",...,"[16, 4, 3, 4, 20, 8, 11, 80]","[0, 0, 0, 0, 0, 2, 2, 2]","[2, 2, 3, 3, 4, 4, 4, 5]",[Zeeshan. You don't have to address him as H.E...,17,"[2, 2, 2, 2, 7, 7, 14, 14, 14, 14, 14, 21, 21,...","Zeeshan Usmani, Ph.D","[611, 587, 641, 55, 70, 308, 258, 115, 562, 2,...","[16, 164, 284, 6, 10, 109, 91, 9, 27, 0, 194, ...","[0, 2, 0, 0, 4, 0, 1, 1, 5, 46, 0, 1, 34, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2770,5546,Particle Code;https://www.crunchbase.com/organ...,"Consumer Electronics,Hardware,Software",galia-benartzi-1a0aa220,"I am a technology entrepreneur, currently work...","5,108 followers",[],[Feed post number 1\nGalia Benartzi commented ...,[Feed post number 1\nGalia Benartzi likes this...,"[{'title': 'Co-Founder, Business Development',...",...,[],[],[],[Been working with Itay for years and happy to...,2,"[1460, 1825]",Galia Benartzi,"[50, 7]","[11, 2]","[16, 0]"
2771,5547,Particle Code;https://www.crunchbase.com/organ...,"Consumer Electronics,Hardware,Software",yotamshacham,Game developer who become a technology entrepr...,"1,564 followers",[Feed post number 1\nYotam Shacham\nYotam Shac...,[Feed post number 1\nYotam Shacham replied to ...,[Feed post number 1\nYotam Shacham likes this\...,"[{'title': 'Chief Technology Officer', 'compan...",...,"[7, 2, 5, 0, 0]","[0, 10, 0, 0, 0]","[270, 1095, 2920, 2920]",[Yuyi Kitano Lum is an email deliverability ex...,11,"[240, 240, 240, 240, 240, 240, 365, 365, 365, ...",Yotam Shacham,"[38, 38, 100, 93, 87, 98, 173, 148, 49]","[7, 7, 10, 12, 7, 20, 34, 25, 5]","[0, 0, 0, 0, 0, 0, 3, 0, 0]"
2772,5548,Particle Code;https://www.crunchbase.com/organ...,"Consumer Electronics,Hardware,Software",yudi-levi-4bb91911,,"4,533 followers",[Feed post number 1\nYudi Levi\nYudi Levi\n • ...,[],[Feed post number 1\nYudi Levi likes this\nJon...,"[{'title': 'Chief Architect', 'company_name': ...",...,[0],[0],[2555],[],0,[],,[],[],[]
2773,5549,myDocket;https://www.crunchbase.com/organizati...,"Advertising,Data and Analytics,Information Tec...",jasonwesbecher,Twenty three-year career as metrics-driven sal...,"3,743 followers",[Feed post number 1\nJason Wesbecher\nJason We...,[Feed post number 1\nJason Wesbecher commented...,[Feed post number 1\nJason Wesbecher likes thi...,"[{'title': 'Chief Executive Officer', 'company...",...,"[6, 0, 0, 2, 0, 0, 4, 4, 0, 0, 0]","[1, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]","[30, 60, 180, 210, 240, 270, 300, 300, 365, 365]",[I miss you Gio!],1,[365],Jason Wesbecher,"[169, 150, 261, 11, 168, 22, 174, 222]","[41, 63, 32, 13, 74, 5, 55, 61]","[2, 0, 1, 1, 1, 0, 0, 0]"


### Extract Features from Reactions

In [10]:
# Function to convert time strings to days
def time_to_days(time_str):
    """
    Converts time strings like '1 year', '2 months', or '3 days' into equivalent days.

    Parameters:
    time_str (str): A string representing time duration.

    Returns:
    int or None: The equivalent number of days or None if the format is invalid.
    """
    time_str = time_str.strip().lower()
    if "year" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 365
    elif "month" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 30
    elif "week" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 7
    elif "day" in time_str:
        return int(re.search(r"\d+", time_str).group())
    elif "hour" in time_str or "minute" in time_str or "second" in time_str:
        return 1  # Less than a day counts as 1 day
    return None

# Function to extract the reaction type
def extract_reaction(post, account_name):
    """
    Extracts the type of reaction for a given account name from a post.

    Parameters:
    post (str): The post content.
    account_name (str): The account name whose reaction is to be extracted.

    Returns:
    str or None: The reaction type or None if no reaction is found.
    """
    if not post or not account_name:
        return None
    match = re.search(rf"{re.escape(account_name)}\s+(likes this|finds this insightful|supports this|comments on this)", 
                      post, re.IGNORECASE)
    return match.group(1).strip() if match else None

# Function to extract the post text
def extract_post_text(post):
    """
    Extracts post text between '\nFollow\n' and the first '\n…more' or '\nhashtag'.

    Parameters:
    post (str): The post content.

    Returns:
    str or None: Extracted post text or None if no match is found.
    """
    if not post:
        return None
    match = re.search(r"\nFollow\n(.*?)(?:\n…more|\nhashtag)", post, re.DOTALL)
    return match.group(1).strip() if match else None

# Function to extract the time in days
def extract_time_in_days(post):
    """
    Extracts the time before '\nFollow\n' and converts it into days.

    Parameters:
    post (str): The post content.

    Returns:
    int or None: The equivalent time in days or None if no valid format is found.
    """
    if not post:
        return None
    match = re.search(r"•?\s*\n\s*(\d+\s*[a-z]+)\s+ago\s+Follow\n", post, re.IGNORECASE)
    return time_to_days(match.group(1).strip()) if match else None

# Function to extract hashtags
def extract_hashtags(post):
    """
    Extracts hashtags from the post content.

    Parameters:
    post (str): The post content.

    Returns:
    list: A list of extracted hashtags without the '#' prefix.
    """
    if not post:
        return []
    match = re.search(r"\nhashtag\n(#[^\n]+)", post, re.DOTALL)
    if match:
        hashtags = match.group(1).split("\n")
        return [tag.lstrip("#") for tag in hashtags if tag.startswith("#")]
    return []

# Process each row for reactions and related details
def process_row(row):
    """
    Processes a row to extract cleaned reactions, reaction types, hashtags, and times.

    Parameters:
    row (pd.Series): A DataFrame row containing 'reactions' and 'account_name'.

    Returns:
    pd.Series: A Series with additional columns for reactions and related details.
    """
    reactions_list = row.get("reactions", [])
    account_name = row.get("account_name")
    
    cleaned_reactions = []
    reaction_types = []
    hashtag_lists = []
    post_times = []
    
    for reaction_post in reactions_list:
        if not reaction_post:
            continue
        post_text = extract_post_text(reaction_post)
        reaction_type = extract_reaction(reaction_post, account_name)
        hashtags = extract_hashtags(reaction_post)
        time_in_days = extract_time_in_days(reaction_post)
        
        if post_text:
            cleaned_reactions.append(post_text)
        if reaction_type:
            reaction_types.append(reaction_type)
        hashtag_lists.append(hashtags)
        if time_in_days is not None:
            post_times.append(time_in_days)
    
    return pd.Series({
        "cleaned_reactions": cleaned_reactions,
        "reaction_types": reaction_types,
        "hashtag_lists": hashtag_lists,
        "new_num_posts": len(reactions_list),
        "new_post_times": post_times
    })

# Function to extract engagement metrics
def extract_post_details(post):
    """
    Extracts metrics like reactions, comments, and reposts from a post.

    Parameters:
    post (str): The post content.

    Returns:
    tuple: A tuple containing the number of reactions, comments, and reposts.
    """
    main_number_match = re.search(r"\n(\d+)\n", post)
    main_number = int(main_number_match.group(1)) if main_number_match else 0

    comments_match = re.search(r"(\d+)\scomments", post, re.IGNORECASE)
    comments_number = int(comments_match.group(1)) if comments_match else 0

    reposts_match = re.search(r"comments\n(\d+)\sreposts?", post, re.IGNORECASE)
    reposts_number = int(reposts_match.group(1)) if reposts_match else 0

    return main_number, comments_number, reposts_number

# Process each row to extract engagement metrics
def process_reactions_details(row):
    """
    Processes a row to extract reaction engagement metrics.

    Parameters:
    row (pd.Series): A DataFrame row containing 'reactions'.

    Returns:
    pd.Series: A Series with columns for reactions, comments, and reposts.
    """
    reactions_list = row.get("reactions", [])
    if isinstance(reactions_list, list):
        engagements, comments, reposts = zip(
            *(extract_post_details(post) for post in reactions_list if post)
        ) if reactions_list else ([], [], [])
        return pd.Series({
            "engagements_reactions": list(engagements),
            "comments_on_reactions": list(comments),
            "reposts_number_reactions": list(reposts),
        })
    return pd.Series({
        "engagements_reactions": [],
        "comments_on_reactions": [],
        "reposts_number_reactions": [],
    })

# Apply processing functions to the DataFrame
temp_result = final_combined_df.apply(process_row, axis=1)
final_combined_df["cleaned_reactions"] = temp_result["cleaned_reactions"]
final_combined_df["reaction_types"] = temp_result["reaction_types"]
final_combined_df["hashtag_lists"] = temp_result["hashtag_lists"]
final_combined_df["num_posts"] = temp_result["new_num_posts"]
final_combined_df["reaction_times"] = temp_result["new_post_times"]

# Add columns for engagement metrics
reactions_details = final_combined_df.apply(process_reactions_details, axis=1)
final_combined_df["engagements_reactions"] = reactions_details["engagements_reactions"]
final_combined_df["comments_on_reactions"] = reactions_details["comments_on_reactions"]
final_combined_df["reposts_number_reactions"] = reactions_details["reposts_number_reactions"]

# Display the updated DataFrame
final_combined_df


Unnamed: 0.1,Unnamed: 0,Organization Name,Industry Groups,username,about,followers,posts,comments,reactions,experience,...,engagements_comments,comments_on_comments,reposts_number_comments,cleaned_reactions,reaction_types,hashtag_lists,reaction_times,engagements_reactions,comments_on_reactions,reposts_number_reactions
0,0,Netagenda;https://www.crunchbase.com/organizat...,"Consumer Goods,Health Care",melihvatansever,,"5,093 followers",[Feed post number 1\nMelih Vatansever\nMelih V...,[Feed post number 1\nMelih Vatansever replied ...,[Feed post number 1\nMelih Vatansever likes th...,"[{'title': 'Partner', 'company_name': 'VNTRS G...",...,"[133, 133, 329, 18, 329, 115, 19, 19]","[32, 32, 11, 3, 62, 16, 5, 5]","[0, 0, 1, 1, 0, 1, 2, 2]",[NORMAL für Rolf Hänggi:\nMaking things better...,"[likes this, finds this insightful, likes this...","[[], [], [], [], [], [], [], [], [], [], []]","[1, 730, 1095, 4, 5, 42, 49, 56, 63, 70, 77]","[34, 53, 103, 231, 93, 27, 1, 5, 9, 0, 0]","[9, 14, 9, 15, 2, 0, 0, 0, 0, 54, 0]","[5, 6, 4, 4, 1, 56, 68, 68, 68, 68, 112]"
1,1,Hideout;https://www.crunchbase.com/organizatio...,"Food and Beverage,Transportation",chefdaniellesobel,,72 followers,[Feed post number 1\ndanielle sobel\ndanielle ...,[],[],"[{'title': 'Executive Chef', 'company_name': '...",...,[],[],[],[],[],[],[],[],[],[]
2,3,"Scaphold, Inc. (acquired by Amazon);https://ww...","Apps,Mobile,Software",vcning,,"12,296 followers",[Feed post number 1\nVince C. Ning\nVince C. N...,[Feed post number 1\nVince C.’s profile photo\...,[Feed post number 1\nVince C.’s profile photo\...,"[{'title': 'Founder & CEO', 'company_name': 'N...",...,"[229, 10, 233, 215, 68, 122, 71, 253, 82]","[54, 0, 63, 43, 17, 14, 13, 23, 25]","[8, 38, 4, 0, 0, 1, 0, 0, 0]",[Eaze Inc. is announcing a $10 million Series ...,"[supports this, supports this, likes this, lik...","[[], [], [], [], [], [], [], [], [], [], [], [...","[365, 2, 1095, 1460, 35, 42, 210, 56, 3285, 70...","[229, 118, 632, 213, 0, 144, 135, 17, 52, 637,...","[54, 12, 11, 22, 0, 10, 31, 3, 3, 91, 0, 0, 0,...","[8, 1, 2, 0, 27, 5, 1, 0, 2, 18, 38, 38, 65, 8]"
3,4,Engine eCommerce;https://www.crunchbase.com/or...,"Commerce and Shopping,Consumer Electronics,Har...",bpuryear,"As a resourceful and driven professional, I ha...","2,532 followers",[Feed post number 1\nBlake Puryear reposted th...,[Feed post number 1\nBlake Puryear commented o...,[],[{'title': 'Director of Product: Merchant & Sh...,...,"[35, 45, 28, 43, 65, 43, 30, 236, 339]","[23, 14, 10, 14, 28, 3, 8, 53, 133]","[0, 11, 0, 3, 3, 1, 0, 0, 0]",[],[],[],[],[],[],[]
4,7,PredictifyMe;https://www.crunchbase.com/organi...,"Artificial Intelligence (AI),Data and Analytic...",zusmani,With over 10 years of experience in data scien...,"169,714 followers","[Feed post number 1\nZeeshan Usmani, Ph.D\nZee...","[Feed post number 1\nZeeshan Usmani, Ph.D repl...",[],"[{'title': 'Co-Founder & CEO', 'company_name':...",...,"[611, 587, 641, 55, 70, 308, 258, 115, 562, 2,...","[16, 164, 284, 6, 10, 109, 91, 9, 27, 0, 194, ...","[0, 2, 0, 0, 4, 0, 1, 1, 5, 46, 0, 1, 34, 0]",[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2770,5546,Particle Code;https://www.crunchbase.com/organ...,"Consumer Electronics,Hardware,Software",galia-benartzi-1a0aa220,"I am a technology entrepreneur, currently work...","5,108 followers",[],[Feed post number 1\nGalia Benartzi commented ...,[Feed post number 1\nGalia Benartzi likes this...,"[{'title': 'Co-Founder, Business Development',...",...,"[50, 7]","[11, 2]","[16, 0]",[Super excited to finally announce that I've j...,"[likes this, likes this, likes this, likes thi...","[[], [], [defi decentralized finance, many of ...","[30, 730, 1095, 1460, 1825, 3285, 3650, 4015, ...","[35, 402, 92, 6, 50, 95, 95, 95, 5, 101, 20, 27]","[0, 57, 18, 0, 11, 11, 11, 11, 0, 4, 2, 0]","[0, 10, 5, 0, 16, 0, 0, 0, 0, 3, 0, 0]"
2771,5547,Particle Code;https://www.crunchbase.com/organ...,"Consumer Electronics,Hardware,Software",yotamshacham,Game developer who become a technology entrepr...,"1,564 followers",[Feed post number 1\nYotam Shacham\nYotam Shac...,[Feed post number 1\nYotam Shacham replied to ...,[Feed post number 1\nYotam Shacham likes this\...,"[{'title': 'Chief Technology Officer', 'compan...",...,"[38, 38, 100, 93, 87, 98, 173, 148, 49]","[7, 7, 10, 12, 7, 20, 34, 25, 5]","[0, 0, 0, 0, 0, 0, 3, 0, 0]",[🚀 Revolutionizing Medical Malpractice Insuran...,"[likes this, likes this, likes this, likes thi...","[[], [], [], [], [Ozempic isn't without seriou...","[30, 730, 90, 120, 150, 180, 210, 240, 270, 30...","[13, 98, 18, 61, 22, 26, 0, 25, 27, 26, 14, 15...","[0, 2, 0, 15, 0, 0, 0, 2, 0, 0, 0, 21, 99, 20,...","[0, 10, 0, 0, 0, 0, 29, 6, 0, 0, 0, 4, 29, 0, 0]"
2772,5548,Particle Code;https://www.crunchbase.com/organ...,"Consumer Electronics,Hardware,Software",yudi-levi-4bb91911,,"4,533 followers",[Feed post number 1\nYudi Levi\nYudi Levi\n • ...,[],[Feed post number 1\nYudi Levi likes this\nJon...,"[{'title': 'Chief Architect', 'company_name': ...",...,[],[],[],[Calcalist כלכליסט picked up my LinkedIn post ...,[],[[]],[365],[42],[2],[1]
2773,5549,myDocket;https://www.crunchbase.com/organizati...,"Advertising,Data and Analytics,Information Tec...",jasonwesbecher,Twenty three-year career as metrics-driven sal...,"3,743 followers",[Feed post number 1\nJason Wesbecher\nJason We...,[Feed post number 1\nJason Wesbecher commented...,[Feed post number 1\nJason Wesbecher likes thi...,"[{'title': 'Chief Executive Officer', 'company...",...,"[169, 150, 261, 11, 168, 22, 174, 222]","[41, 63, 32, 13, 74, 5, 55, 61]","[2, 0, 1, 1, 1, 0, 0, 0]",[My sales team invited me to three sales calls...,"[likes this, likes this, likes this, likes thi...","[[], [], [], [], [], [], [], [], [], [], [], [...","[7, 730, 21, 120, 150, 180, 210, 240, 270, 365...","[56, 509, 66, 124, 370, 206, 153, 455, 455, 25...","[3, 160, 0, 26, 79, 21, 16, 103, 103, 97, 9, 3...","[0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 27]"


In [11]:
final_combined_df.at[2, "reactions"][2]

"Feed post number 3\nVince C.’s profile photo\nVince C. Ning likes this\nTim Barash\nTim Barash\n • 3rd+\n • 3rd+\nDutchie\nDutchie\n4d • Edited • \n 4 days ago\nFollow\nAs of today, Toast is worth ~1/2 as much as Square's parent company, likely somewhere around what the entire Square Seller/B2B ecosystem valuation is. 5-10 years ago this would have been a completely insane thought to have and proves how far vertical SaaS has come. Feel so lucky to have bumped intoand so many others that helped us succeed over the years.\n\nTo the next generation of companies powering main street + vertical industries, you're on the right track and can build huge generational companies that make a big difference to the customers and industries you support. Excited to see what the next wave looks like especially with (buzzwords incoming) AI being another major accelerant beyond fintech for vertical tech companies.\n…more\n632\n11 comments\n2 reposts\nLike\nComment\nRepost\nSend"

In [13]:
final_combined_df.columns

Index(['Unnamed: 0', 'Organization Name', 'Industry Groups', 'username',
       'about', 'followers', 'posts', 'comments', 'reactions', 'experience',
       'education', 'Founded Date', 'equity_raised_in_5_years',
       'Success_after_5_years', 'equity_rounds_raised_after_5_years',
       'Equity_raised_until_now', 'Success_until_now', 'cleaned_posts',
       'num_posts', 'num_reposted_posts', 'engagements_posts',
       'comments_on_posts', 'reposts_number_posts', 'post_times',
       'cleaned_comments', 'num_comments', 'comment_days_ago', 'account_name',
       'engagements_comments', 'comments_on_comments',
       'reposts_number_comments', 'cleaned_reactions', 'reaction_types',
       'hashtag_lists', 'reaction_times', 'engagements_reactions',
       'comments_on_reactions', 'reposts_number_reactions'],
      dtype='object')

In [14]:
final_combined_df[['Organization Name']].nunique()

Organization Name    1604
dtype: int64

### Extract Numnber of Followers

In [15]:
# Remove the "followers" text and commas, then convert the column to integers
final_combined_df['followers'] = final_combined_df['followers'] \
    .str.replace('followers', '', regex=False) \  # Remove the word "followers" from the string
    .str.replace(',', '', regex=False) \          # Remove commas for proper numeric conversion
    .astype(int)                                  # Convert the cleaned strings to integers

# Verify the conversion by checking unique values in the "Success_after_5_years" column
unique_values = final_combined_df["Success_after_5_years"].unique()  # Get unique values
print(unique_values)  # Print the unique values to verify



array([-1,  1,  0], dtype=int64)

In [16]:
final_combined_df_first_half = pd.read_csv(r"..\Final_Company_Dataset\final_combined_df_first_half1.csv")
final_combined_df_first_half = final_combined_df_first_half[['Organization Name', 'username', 'about', 'followers',
       'posts', 'comments', 'reactions', 'experience', 'education',
       'Founded Date', 'equity_raised_in_5_years', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Equity_raised_until_now',
       'Success_until_now', 'cleaned_posts', 'num_posts', 'num_reposted_posts',
       'post_times', 'engagements_posts', 'comments_on_posts',
       'reposts_number_posts', 'cleaned_comments', 'num_comments',
       'comment_days_ago', 'engagements_comments', 'comments_on_comments',
       'reposts_number_comments', 'account_name', 'cleaned_reactions',
       'reaction_types', 'reaction_times']]

In [17]:
final_combined_df_first_half

Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,Founded Date,...,cleaned_comments,num_comments,comment_days_ago,engagements_comments,comments_on_comments,reposts_number_comments,account_name,cleaned_reactions,reaction_types,reaction_times
0,ORA Graphene Audio Inc.,ari-pinkas-88913811,Ari Pinkas is Co-founder and VP Business Devel...,1543,[],[],[],"[{'title': 'Co-Founder', 'company_name': 'ORA ...","[{'degree': 'BComm, Marketing', 'institution':...",2016,...,[],0,[],[],[],[],,[],[],[]
1,ORA Graphene Audio Inc.,kaiwen-hu-ph-d-a32a0946,- PhD in Materials Engineering. Extensive Expe...,725,"['Feed post number 1\nKaiwen Hu, Ph.D\nKaiwen ...","['Feed post number 1\nKaiwen Hu, Ph.D commente...",[],"[{'title': 'Co-founder VP Research', 'company_...","[{'degree': 'Doctor of Philosophy (Ph.D.), Mat...",2016,...,['Best of luck Rune!'],1,[365],[82],[19],[0],"Kaiwen Hu, Ph.D",[],[],[]
2,ORA Graphene Audio Inc.,michaelkraft,CEO / President / Independent Board Director /...,3729,['Feed post number 1\nMichael Kraft\nMichael K...,"[""Feed post number 1\nMichael Kraft replied to...",['Feed post number 1\nMichael Kraft likes this...,"[{'title': 'Chief Executive Officer', 'company...","[{'degree': 'ExecEd, Technology Marketing & St...",2016,...,['CONGRATS on the CEO role! We may need to tal...,10,"[90, 90, 90, 90, 90, 90, 90, 90, 90, 90]","[28, 71, 28, 68, 243, 85, 66, 108]","[21, 6, 21, 11, 116, 7, 61, 4]","[0, 1, 0, 8, 0, 3, 1, 2]",Michael Kraft,"['Over the past week, I\'ve heard the same thi...","['likes this', 'likes this', 'likes this', 'fi...","[7, 7, 60, 60, 120, 180, 300, 270, 300, 365, 3..."
3,ORA Graphene Audio Inc.,helgeseetzen,My journey took me from tech founder (BrightSi...,4925,['Feed post number 1\nHelge Seetzen\nHelge See...,['Feed post number 1\nHelge Seetzen replied to...,[],"[{'title': 'Managing Partner & CEO', 'company_...","[{'degree': 'PhD, Physics & Computer Science (...",2016,...,"['Congratulations! Bravo to the whole team!!',...",15,"[90, 90, 120, 120, 120, 120, 120, 120, 120, 12...","[465, 70, 465, 465, 465, 465, 465, 465]","[75, 5, 75, 75, 75, 75, 75, 75]","[7, 0, 7, 7, 7, 7, 7, 7]",Helge Seetzen,[],[],[]
4,ORA Graphene Audio Inc.,regaskell,Robert-Eric Gaskell is an experienced audio pr...,888,[],"['Feed post number 1\nRobert-Eric Gaskell, Ph....","[""Feed post number 1\nRobert-Eric Gaskell, Ph....","[{'title': 'Co-Founder, Inventor, VP Product',...","[{'degree': 'PhD, Sound Recording', 'instituti...",2016,...,"['Which is it? ""100% pure graphene"" or multi l...",1,[1825],[48],[12],[1],"Robert-Eric Gaskell, Ph.D.","[""Thank you Robert-Eric Gaskell, Ph.D. and Ari...","['likes this', 'likes this', 'likes this', 'li...","[21, 30, 30, 150, 300, 300, 300, 330, 330, 330..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13127,Art Health Solutions,phill-bell-9a784065,As Chief Executive Officer and co-founder of A...,2557,[],[],[],"[{'title': 'Chief Executive Officer', 'company...",[{'degree': 'Executive Post Graduate Diploma i...,2018,...,[],0,[],[],[],[],,[],[],[]
13128,Art Health Solutions,scot-forshaw-2859b12,I have worked at the cutting edge of software ...,891,[],[],[],"[{'title': 'fCTO', 'company_name': 'ART Health...",[{'degree': 'City and Guilds of London Institu...,2018,...,[],0,[],[],[],[],,[],[],[]
13129,Art Health Solutions,elliecaley,"I believe that an organisation’s performance, ...",2042,[],[],[],[{'title': 'Senior Workplace and Wellbeing Con...,[{'degree': 'Master of Science - MSc (Hons) Oc...,2018,...,[],0,[],[],[],[],,[],[],[]
13130,TEDU,nicolobates,With a background in finance and economics foc...,1722,[],[],[],"[{'title': 'Founder, Chief Executive Officer',...",[{'degree': 'Bachelor of Business Administrati...,2018,...,[],0,[],[],[],[],,[],[],[]


In [18]:
final_combined_df = final_combined_df[['Organization Name', 'username', 'about', 'followers',
       'posts', 'comments', 'reactions', 'experience', 'education',
       'Founded Date', 'equity_raised_in_5_years', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Equity_raised_until_now',
       'Success_until_now', 'cleaned_posts', 'num_posts', 'num_reposted_posts',
       'post_times', 'engagements_posts', 'comments_on_posts',
       'reposts_number_posts', 'cleaned_comments', 'num_comments',
       'comment_days_ago', 'engagements_comments', 'comments_on_comments',
       'reposts_number_comments', 'account_name', 'cleaned_reactions',
       'reaction_types', 'reaction_times']]

In [19]:
final_combined_df

Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,Founded Date,...,cleaned_comments,num_comments,comment_days_ago,engagements_comments,comments_on_comments,reposts_number_comments,account_name,cleaned_reactions,reaction_types,reaction_times
0,Netagenda;https://www.crunchbase.com/organizat...,melihvatansever,,5093,[Feed post number 1\nMelih Vatansever\nMelih V...,[Feed post number 1\nMelih Vatansever replied ...,[Feed post number 1\nMelih Vatansever likes th...,"[{'title': 'Partner', 'company_name': 'VNTRS G...","[{'degree': 'M Sc BA, Business Administration'...",2014-12-01,...,[Congrats Dimitri Nabatov - Lead gen has begun...,12,"[180, 180, 180, 180, 180, 180, 210, 210, 210, ...","[133, 133, 329, 18, 329, 115, 19, 19]","[32, 32, 11, 3, 62, 16, 5, 5]","[0, 0, 1, 1, 0, 1, 2, 2]",Melih Vatansever,[NORMAL für Rolf Hänggi:\nMaking things better...,"[likes this, finds this insightful, likes this...","[1, 730, 1095, 4, 5, 42, 49, 56, 63, 70, 77]"
1,Hideout;https://www.crunchbase.com/organizatio...,chefdaniellesobel,,72,[Feed post number 1\ndanielle sobel\ndanielle ...,[],[],"[{'title': 'Executive Chef', 'company_name': '...","[{'degree': 'W20', 'institution': 'Y Combinato...",2019-10-01,...,[],0,[],[],[],[],,[],[],[]
2,"Scaphold, Inc. (acquired by Amazon);https://ww...",vcning,,12296,[Feed post number 1\nVince C. Ning\nVince C. N...,[Feed post number 1\nVince C.’s profile photo\...,[Feed post number 1\nVince C.’s profile photo\...,"[{'title': 'Founder & CEO', 'company_name': 'N...","[{'degree': 'Public Leadership, Public Policy'...",2016-05-01,...,[Congrats Cory! It’s great to see Eaze continu...,9,"[0.5416666666666666, 7, 30, 150, 210, 240, 240...","[229, 10, 233, 215, 68, 122, 71, 253, 82]","[54, 0, 63, 43, 17, 14, 13, 23, 25]","[8, 38, 4, 0, 0, 1, 0, 0, 0]",Vince C. Ning,[Eaze Inc. is announcing a $10 million Series ...,"[supports this, supports this, likes this, lik...","[365, 2, 1095, 1460, 35, 42, 210, 56, 3285, 70..."
3,Engine eCommerce;https://www.crunchbase.com/or...,bpuryear,"As a resourceful and driven professional, I ha...",2532,[Feed post number 1\nBlake Puryear reposted th...,[Feed post number 1\nBlake Puryear commented o...,[],[{'title': 'Director of Product: Merchant & Sh...,"[{'degree': 'B.S. Computer Science, Computer S...",2016-01-01,...,"[Relentless optimism -- often, stuff doesn't w...",9,"[28, 28, 28, 30, 60, 90, 90, 120, 120]","[35, 45, 28, 43, 65, 43, 30, 236, 339]","[23, 14, 10, 14, 28, 3, 8, 53, 133]","[0, 11, 0, 3, 3, 1, 0, 0, 0]",Blake Puryear,[],[],[]
4,PredictifyMe;https://www.crunchbase.com/organi...,zusmani,With over 10 years of experience in data scien...,169714,"[Feed post number 1\nZeeshan Usmani, Ph.D\nZee...","[Feed post number 1\nZeeshan Usmani, Ph.D repl...",[],"[{'title': 'Co-Founder & CEO', 'company_name':...","[{'degree': 'PhD, Computer Science', 'institut...",2014-05-01,...,[Zeeshan. You don't have to address him as H.E...,17,"[2, 2, 2, 2, 7, 7, 14, 14, 14, 14, 14, 21, 21,...","[611, 587, 641, 55, 70, 308, 258, 115, 562, 2,...","[16, 164, 284, 6, 10, 109, 91, 9, 27, 0, 194, ...","[0, 2, 0, 0, 4, 0, 1, 1, 5, 46, 0, 1, 34, 0]","Zeeshan Usmani, Ph.D",[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2770,Particle Code;https://www.crunchbase.com/organ...,galia-benartzi-1a0aa220,"I am a technology entrepreneur, currently work...",5108,[],[Feed post number 1\nGalia Benartzi commented ...,[Feed post number 1\nGalia Benartzi likes this...,"[{'title': 'Co-Founder, Business Development',...","[{'degree': '', 'institution': 'Johns Hopkins ...",2009-01-01,...,[Been working with Itay for years and happy to...,2,"[1460, 1825]","[50, 7]","[11, 2]","[16, 0]",Galia Benartzi,[Super excited to finally announce that I've j...,"[likes this, likes this, likes this, likes thi...","[30, 730, 1095, 1460, 1825, 3285, 3650, 4015, ..."
2771,Particle Code;https://www.crunchbase.com/organ...,yotamshacham,Game developer who become a technology entrepr...,1564,[Feed post number 1\nYotam Shacham\nYotam Shac...,[Feed post number 1\nYotam Shacham replied to ...,[Feed post number 1\nYotam Shacham likes this\...,"[{'title': 'Chief Technology Officer', 'compan...","[{'degree': ""Bachelor's degree, Computer Scien...",2009-01-01,...,[Yuyi Kitano Lum is an email deliverability ex...,11,"[240, 240, 240, 240, 240, 240, 365, 365, 365, ...","[38, 38, 100, 93, 87, 98, 173, 148, 49]","[7, 7, 10, 12, 7, 20, 34, 25, 5]","[0, 0, 0, 0, 0, 0, 3, 0, 0]",Yotam Shacham,[🚀 Revolutionizing Medical Malpractice Insuran...,"[likes this, likes this, likes this, likes thi...","[30, 730, 90, 120, 150, 180, 210, 240, 270, 30..."
2772,Particle Code;https://www.crunchbase.com/organ...,yudi-levi-4bb91911,,4533,[Feed post number 1\nYudi Levi\nYudi Levi\n • ...,[],[Feed post number 1\nYudi Levi likes this\nJon...,"[{'title': 'Chief Architect', 'company_name': ...",[],2009-01-01,...,[],0,[],[],[],[],,[Calcalist כלכליסט picked up my LinkedIn post ...,[],[365]
2773,myDocket;https://www.crunchbase.com/organizati...,jasonwesbecher,Twenty three-year career as metrics-driven sal...,3743,[Feed post number 1\nJason Wesbecher\nJason We...,[Feed post number 1\nJason Wesbecher commented...,[Feed post number 1\nJason Wesbecher likes thi...,"[{'title': 'Chief Executive Officer', 'company...","[{'degree': 'Economics, The Wharton School of ...",2012-06-01,...,[I miss you Gio!],1,[365],"[169, 150, 261, 11, 168, 22, 174, 222]","[41, 63, 32, 13, 74, 5, 55, 61]","[2, 0, 1, 1, 1, 0, 0, 0]",Jason Wesbecher,[My sales team invited me to three sales calls...,"[likes this, likes this, likes this, likes thi...","[7, 730, 21, 120, 150, 180, 210, 240, 270, 365..."


In [20]:
final_combined_df = pd.concat([final_combined_df_first_half, final_combined_df], axis=0, ignore_index=True)


In [21]:
final_combined_df

Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,Founded Date,...,cleaned_comments,num_comments,comment_days_ago,engagements_comments,comments_on_comments,reposts_number_comments,account_name,cleaned_reactions,reaction_types,reaction_times
0,ORA Graphene Audio Inc.,ari-pinkas-88913811,Ari Pinkas is Co-founder and VP Business Devel...,1543,[],[],[],"[{'title': 'Co-Founder', 'company_name': 'ORA ...","[{'degree': 'BComm, Marketing', 'institution':...",2016,...,[],0,[],[],[],[],,[],[],[]
1,ORA Graphene Audio Inc.,kaiwen-hu-ph-d-a32a0946,- PhD in Materials Engineering. Extensive Expe...,725,"['Feed post number 1\nKaiwen Hu, Ph.D\nKaiwen ...","['Feed post number 1\nKaiwen Hu, Ph.D commente...",[],"[{'title': 'Co-founder VP Research', 'company_...","[{'degree': 'Doctor of Philosophy (Ph.D.), Mat...",2016,...,['Best of luck Rune!'],1,[365],[82],[19],[0],"Kaiwen Hu, Ph.D",[],[],[]
2,ORA Graphene Audio Inc.,michaelkraft,CEO / President / Independent Board Director /...,3729,['Feed post number 1\nMichael Kraft\nMichael K...,"[""Feed post number 1\nMichael Kraft replied to...",['Feed post number 1\nMichael Kraft likes this...,"[{'title': 'Chief Executive Officer', 'company...","[{'degree': 'ExecEd, Technology Marketing & St...",2016,...,['CONGRATS on the CEO role! We may need to tal...,10,"[90, 90, 90, 90, 90, 90, 90, 90, 90, 90]","[28, 71, 28, 68, 243, 85, 66, 108]","[21, 6, 21, 11, 116, 7, 61, 4]","[0, 1, 0, 8, 0, 3, 1, 2]",Michael Kraft,"['Over the past week, I\'ve heard the same thi...","['likes this', 'likes this', 'likes this', 'fi...","[7, 7, 60, 60, 120, 180, 300, 270, 300, 365, 3..."
3,ORA Graphene Audio Inc.,helgeseetzen,My journey took me from tech founder (BrightSi...,4925,['Feed post number 1\nHelge Seetzen\nHelge See...,['Feed post number 1\nHelge Seetzen replied to...,[],"[{'title': 'Managing Partner & CEO', 'company_...","[{'degree': 'PhD, Physics & Computer Science (...",2016,...,"['Congratulations! Bravo to the whole team!!',...",15,"[90, 90, 120, 120, 120, 120, 120, 120, 120, 12...","[465, 70, 465, 465, 465, 465, 465, 465]","[75, 5, 75, 75, 75, 75, 75, 75]","[7, 0, 7, 7, 7, 7, 7, 7]",Helge Seetzen,[],[],[]
4,ORA Graphene Audio Inc.,regaskell,Robert-Eric Gaskell is an experienced audio pr...,888,[],"['Feed post number 1\nRobert-Eric Gaskell, Ph....","[""Feed post number 1\nRobert-Eric Gaskell, Ph....","[{'title': 'Co-Founder, Inventor, VP Product',...","[{'degree': 'PhD, Sound Recording', 'instituti...",2016,...,"['Which is it? ""100% pure graphene"" or multi l...",1,[1825],[48],[12],[1],"Robert-Eric Gaskell, Ph.D.","[""Thank you Robert-Eric Gaskell, Ph.D. and Ari...","['likes this', 'likes this', 'likes this', 'li...","[21, 30, 30, 150, 300, 300, 300, 330, 330, 330..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15902,Particle Code;https://www.crunchbase.com/organ...,galia-benartzi-1a0aa220,"I am a technology entrepreneur, currently work...",5108,[],[Feed post number 1\nGalia Benartzi commented ...,[Feed post number 1\nGalia Benartzi likes this...,"[{'title': 'Co-Founder, Business Development',...","[{'degree': '', 'institution': 'Johns Hopkins ...",2009-01-01,...,[Been working with Itay for years and happy to...,2,"[1460, 1825]","[50, 7]","[11, 2]","[16, 0]",Galia Benartzi,[Super excited to finally announce that I've j...,"[likes this, likes this, likes this, likes thi...","[30, 730, 1095, 1460, 1825, 3285, 3650, 4015, ..."
15903,Particle Code;https://www.crunchbase.com/organ...,yotamshacham,Game developer who become a technology entrepr...,1564,[Feed post number 1\nYotam Shacham\nYotam Shac...,[Feed post number 1\nYotam Shacham replied to ...,[Feed post number 1\nYotam Shacham likes this\...,"[{'title': 'Chief Technology Officer', 'compan...","[{'degree': ""Bachelor's degree, Computer Scien...",2009-01-01,...,[Yuyi Kitano Lum is an email deliverability ex...,11,"[240, 240, 240, 240, 240, 240, 365, 365, 365, ...","[38, 38, 100, 93, 87, 98, 173, 148, 49]","[7, 7, 10, 12, 7, 20, 34, 25, 5]","[0, 0, 0, 0, 0, 0, 3, 0, 0]",Yotam Shacham,[🚀 Revolutionizing Medical Malpractice Insuran...,"[likes this, likes this, likes this, likes thi...","[30, 730, 90, 120, 150, 180, 210, 240, 270, 30..."
15904,Particle Code;https://www.crunchbase.com/organ...,yudi-levi-4bb91911,,4533,[Feed post number 1\nYudi Levi\nYudi Levi\n • ...,[],[Feed post number 1\nYudi Levi likes this\nJon...,"[{'title': 'Chief Architect', 'company_name': ...",[],2009-01-01,...,[],0,[],[],[],[],,[Calcalist כלכליסט picked up my LinkedIn post ...,[],[365]
15905,myDocket;https://www.crunchbase.com/organizati...,jasonwesbecher,Twenty three-year career as metrics-driven sal...,3743,[Feed post number 1\nJason Wesbecher\nJason We...,[Feed post number 1\nJason Wesbecher commented...,[Feed post number 1\nJason Wesbecher likes thi...,"[{'title': 'Chief Executive Officer', 'company...","[{'degree': 'Economics, The Wharton School of ...",2012-06-01,...,[I miss you Gio!],1,[365],"[169, 150, 261, 11, 168, 22, 174, 222]","[41, 63, 32, 13, 74, 5, 55, 61]","[2, 0, 1, 1, 1, 0, 0, 0]",Jason Wesbecher,[My sales team invited me to three sales calls...,"[likes this, likes this, likes this, likes thi...","[7, 730, 21, 120, 150, 180, 210, 240, 270, 365..."


In [30]:
# Ensure the "Success_after_5_years" column is of type string before saving
final_combined_df['Success_after_5_years'] = final_combined_df['Success_after_5_years'].astype(str)

# Save the DataFrame to a CSV file
# The file will be saved in the specified directory with the name 'final_combined_df_first_half3.csv'
final_combined_df.to_csv(r"..\Final_Company_Dataset\final_combined_df_first_half3.csv", index=False)



In [29]:
# Filter rows where 'Success_after_5_years' is equal to "-1"
filtered_data = final_combined_df[
    final_combined_df['Success_after_5_years'] == "-1"
].copy()  # Create a copy to avoid modifying the original DataFrame

# Further filter rows where 'cleaned_posts' is not empty
# Ensure 'cleaned_posts' is a list and has at least one element
filtered_data = filtered_data[
    filtered_data['cleaned_posts'].apply(lambda x: isinstance(x, list) and len(x) > 0)
]

# Display the filtered data with relevant columns
filtered_data[['Success_after_5_years', 'cleaned_posts']]



Unnamed: 0,Success_after_5_years,cleaned_posts
13132,-1,[We are beyond proud of you! 🐥🤵🏻‍♂️👏🏼👏🏼\nkreis...
13133,-1,[☺️\n+4]
13134,-1,[This has been a long time coming.\n\nI’m very...
13135,-1,[Incredibly excited about this new product lin...
13136,-1,[Nice meeting Ambassador of Pakistan to the U....
...,...,...
15897,-1,[We're \nhashtag\n#hiring. Know anyone who mig...
15900,-1,[I am going to say something to female founder...
15901,-1,[I have been working with the Kaddie team on t...
15905,-1,[Had a fantastic time at my 4th Big Exit event...


# Stop Here go to next notebook: Add_Features-final