# Add_Features-start:

the format for the scraped data had some slide changes. That is why this notebook has some variations. This is the first notebook. With open company data. The remaining open company numerical feature out of comments, posts, and reactions are extracted in the last notebook: Add_Features-final. The closed companies are teated in the notebook Add_Feautures-start-closed

# Extracting features out of Posts, Comments, and Reactions

In [None]:
import pandas as pd
import ast
import re

In [None]:
final_combined_df = pd.read_csv(r"..\Final_Company_Dataset\filtered_open_data_df.csv")

In [None]:
final_combined_df["Success_after_5_years"].unique()

array([0, 1], dtype=int64)

In [None]:
# Columns to convert from strings to lists
columns_to_convert = ["posts", "comments", "reactions"]

for column in columns_to_convert:
    final_combined_df[column] = final_combined_df[column].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )

## 1.Extracting Numerical Features and extracting Posts, Comments, and Reactions Texts

### Ectracting features out of Posts

In [None]:
# Function to extract the number of reactions, comments, and reposts from a post
def extract_post_details(post):
    """
    This function parses a text snippet representing a post and extracts three key details:
    1. Number of main reactions (e.g., likes or equivalent).
    2. Number of comments.
    3. Number of reposts.

    The function uses regular expressions to locate and extract these numbers.
    """

    # Extract the main number of reactions (e.g., likes or equivalent)
    # Matches a pattern where a number is surrounded by newline characters (\n) (e.g., "\n108\n").
    main_number_match = re.search(r"\n(\d+)\n", post)
    main_number = int(main_number_match.group(1)) if main_number_match else 0  # Default to 0 if no match is found.

    # Extract the number of comments
    # Matches the pattern where the main number is followed by a newline and a number for comments.
    # Example pattern: "108\n5 comments"
    comments_match = re.search(rf"{re.escape(str(main_number))}\n(\d+)\scomments", post, re.IGNORECASE)
    comments_number = int(comments_match.group(1)) if comments_match else 0  # Default to 0 if no match is found.

    # Extract the number of reposts
    # Matches the pattern where the word "comments" is followed by a newline and a number for reposts.
    # Example pattern: "5 comments\n1 repost"
    reposts_match = re.search(r"comments\n(\d+)\sreposts?", post, re.IGNORECASE)
    reposts_number = int(reposts_match.group(1)) if reposts_match else 0  # Default to 0 if no match is found.

    # Return the extracted details as a tuple
    return main_number, comments_number, reposts_number



In [None]:
# Function to convert time strings (e.g., "2 years ago") into the equivalent number of days
def time_to_days(time_str):
    """
    Converts a time string like "2 years", "5 months", etc., into an integer representing days.
    Defaults to 1 day for hours or returns None for unrecognized formats.
    """
    time_str = time_str.strip().lower()
    if "year" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 365
    elif "month" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 30
    elif "week" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 7
    elif "day" in time_str:
        return int(re.search(r"\d+", time_str).group())
    elif "hour" in time_str:
        return 1  # Consider any "hour" reference as 1 day
    else:
        return None  # Return None for unrecognized time formats

# Function to clean posts and extract details like time and repost count
def clean_posts(posts):
    """
    Cleans the posts, counts reposted posts, and extracts time information in days.
    - Removes reposted posts.
    - Extracts time details based on 'ago' and converts them to days.
    - Cleans content text for further processing.
    """
    if not isinstance(posts, list):  # Ensure input is a list
        return [], 0, []

    cleaned_posts = []
    repost_count = 0  # Counter for reposted posts
    post_times = []  # List to store time information in days

    for post in posts:
        if isinstance(post, str) and "reposted this" in post.lower():
            repost_count += 1  # Increment repost counter
            continue  # Skip processing reposted posts

        # Extract time strings (e.g., "2 weeks ago") and convert to days
        if isinstance(post, str):
            time_match = re.search(r"(\d+\s*(?:year|month|week|day)s?)\s*ago\nfollow\n", post, re.IGNORECASE)
            if time_match:
                time_in_days = time_to_days(time_match.group(1))
                post_times.append(time_in_days)
            else:
                post_times.append(None)  # Append None if no time is found

            # Clean the post content starting after the "Follow" section
            match = re.search(r"follow\n(.*)", post, re.IGNORECASE | re.DOTALL)
            if match:
                content = match.group(1)

                # Remove specific markers and text patterns
                content = re.split(r"(\n…more|\nActivate to view larger image|\nDetails|\nNABIS|• 3rd+|\nChris Nguyen)",
                                   content, flags=re.IGNORECASE)[0]
                content = re.sub(r"…more", "", content).strip()  # Clean trailing markers
                cleaned_posts.append(content)

    return cleaned_posts, repost_count, post_times

# Function to extract engagement (likes), comments, and repost counts
def extract_post_details(post):
    """
    Extracts engagement (main number), comments, and repost counts from a post string.
    """
    if not isinstance(post, str):
        return 0, 0, 0  # Default to 0 if input is invalid

    # Extract engagement numbers (e.g., likes/reactions)
    main_number_match = re.search(r"\n(\d+)\n", post)
    main_number = int(main_number_match.group(1)) if main_number_match else 0

    # Extract comment counts
    comments_match = re.search(r"(\d+)\scomments", post, re.IGNORECASE)
    comments_number = int(comments_match.group(1)) if comments_match else 0

    # Extract repost counts
    reposts_match = re.search(r"(\d+)\sreposts?", post, re.IGNORECASE)
    reposts_number = int(reposts_match.group(1)) if reposts_match else 0

    return main_number, comments_number, reposts_number

# Main function to process the DataFrame and add new columns
def process_dataframe(df):
    """
    Processes the DataFrame to clean posts, extract engagement metrics, and add calculated data.
    - Adds cleaned post content.
    - Extracts engagements, comments, and reposts into separate columns.
    - Converts time data into days.
    """
    cleaned_data = []  # Store intermediate results
    engagements_posts = []
    comments_on_posts = []
    reposts_number_posts = []
    post_times_column = []

    for _, row in df.iterrows():
        posts = row.get("posts", [])  # Access the "posts" column

        # Clean posts and count reposted posts
        cleaned_posts, repost_count, post_times = clean_posts(posts)

        # Extract engagement metrics
        engagements = []
        comments = []
        reposts = []
        if isinstance(posts, list):
            for post in posts:
                main_number, comments_number, reposts_number = extract_post_details(post)
                engagements.append(main_number)
                comments.append(comments_number)
                reposts.append(reposts_number)

        engagements_posts.append(engagements)
        comments_on_posts.append(comments)
        reposts_number_posts.append(reposts)
        post_times_column.append(post_times)

        cleaned_data.append({
            "cleaned_posts": cleaned_posts,
            "num_posts": len(cleaned_posts),
            "num_reposted_posts": repost_count
        })

    # Create a new DataFrame with additional columns for cleaned data
    cleaned_df = pd.DataFrame(cleaned_data)
    cleaned_df["engagements_posts"] = engagements_posts
    cleaned_df["comments_on_posts"] = comments_on_posts
    cleaned_df["reposts_number_posts"] = reposts_number_posts
    cleaned_df["post_times"] = post_times_column

    # Combine the original DataFrame with the cleaned data
    return pd.concat([df, cleaned_df], axis=1)

# Example usage
try:
    # Assuming `final_combined_df` is your original DataFrame
    final_combined_df = process_dataframe(final_combined_df)
    print(final_combined_df.head())  # Display the updated DataFrame
except Exception as e:
    print(f"Error: {e}")


   Unnamed: 0        Organization Name                 username  \
0           0  ORA Graphene Audio Inc.      ari-pinkas-88913811   
1           1  ORA Graphene Audio Inc.  kaiwen-hu-ph-d-a32a0946   
2           2  ORA Graphene Audio Inc.             michaelkraft   
3           3  ORA Graphene Audio Inc.             helgeseetzen   
4           4  ORA Graphene Audio Inc.                regaskell   

                                               about        followers  \
0  Ari Pinkas is Co-founder and VP Business Devel...  1,543 followers   
1  - PhD in Materials Engineering. Extensive Expe...    725 followers   
2  CEO / President / Independent Board Director /...  3,729 followers   
3  My journey took me from tech founder (BrightSi...  4,925 followers   
4  Robert-Eric Gaskell is an experienced audio pr...    888 followers   

                                               posts  \
0                                                 []   
1  [Feed post number 1\nKaiwen Hu, Ph.D\nKaiwe

Unnamed: 0.1,Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,...,equity_rounds_raised_after_5_years,Equity_raised_until_now,Success_until_now,cleaned_posts,num_posts,num_reposted_posts,engagements_posts,comments_on_posts,reposts_number_posts,post_times
0,0,ORA Graphene Audio Inc.,ari-pinkas-88913811,Ari Pinkas is Co-founder and VP Business Devel...,"1,543 followers",[],[],[],"[{'title': 'Co-Founder', 'company_name': 'ORA ...","[{'degree': 'BComm, Marketing', 'institution':...",...,1,0.0,0,[],0,0,[],[],[],[]
1,1,ORA Graphene Audio Inc.,kaiwen-hu-ph-d-a32a0946,- PhD in Materials Engineering. Extensive Expe...,725 followers,"[Feed post number 1\nKaiwen Hu, Ph.D\nKaiwen H...","[Feed post number 1\nKaiwen Hu, Ph.D commented...",[],"[{'title': 'Co-founder VP Research', 'company_...","[{'degree': 'Doctor of Philosophy (Ph.D.), Mat...",...,1,0.0,0,"[The SCI Competition, a leading pitch event th...",4,2,"[13, 98, 33, 0, 11, 0]","[0, 2, 3, 0, 0, 0]","[1, 5, 3, 0, 0, 0]","[60, 2555, 2555, 2555]"
2,2,ORA Graphene Audio Inc.,michaelkraft,CEO / President / Independent Board Director /...,"3,729 followers",[Feed post number 1\nMichael Kraft\nMichael Kr...,[Feed post number 1\nMichael Kraft replied to ...,[Feed post number 1\nMichael Kraft likes this\...,"[{'title': 'Chief Executive Officer', 'company...","[{'degree': 'ExecEd, Technology Marketing & St...",...,1,0.0,0,[I just registered for CES 2025 Jan 7-10. I ho...,15,0,"[0, 0, 88, 16, 0, 0, 5, 0, 0, 127, 4, 0, 0, 6, 0]","[0, 0, 43, 0, 0, 0, 0, 0, 0, 49, 0, 0, 0, 3, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[60, 60, 240, 365, 365, 365, 365, 365, 730, 73..."
3,3,ORA Graphene Audio Inc.,helgeseetzen,My journey took me from tech founder (BrightSi...,"4,925 followers",[Feed post number 1\nHelge Seetzen\nHelge Seet...,[Feed post number 1\nHelge Seetzen replied to ...,[],"[{'title': 'Managing Partner & CEO', 'company_...","[{'degree': 'PhD, Physics & Computer Science (...",...,1,0.0,0,[I am delighted to share the first closing of ...,13,0,"[465, 199, 0, 97, 164, 14, 169, 158, 20, 16, 2...","[75, 9, 0, 13, 16, 0, 6, 7, 0, 2, 0, 0, 0]","[7, 24, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0]","[120, 730, 1095, 1095, 1095, 1825, 1825, 2190,..."
4,4,ORA Graphene Audio Inc.,regaskell,Robert-Eric Gaskell is an experienced audio pr...,888 followers,[],"[Feed post number 1\nRobert-Eric Gaskell, Ph.D...","[Feed post number 1\nRobert-Eric Gaskell, Ph.D...","[{'title': 'Co-Founder, Inventor, VP Product',...","[{'degree': 'PhD, Sound Recording', 'instituti...",...,1,0.0,0,[],0,0,[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13127,15053,Art Health Solutions,phill-bell-9a784065,As Chief Executive Officer and co-founder of A...,"2,557 followers",[],[],[],"[{'title': 'Chief Executive Officer', 'company...",[{'degree': 'Executive Post Graduate Diploma i...,...,0,0.0,0,[],0,0,[],[],[],[]
13128,15054,Art Health Solutions,scot-forshaw-2859b12,I have worked at the cutting edge of software ...,891 followers,[],[],[],"[{'title': 'fCTO', 'company_name': 'ART Health...",[{'degree': 'City and Guilds of London Institu...,...,0,0.0,0,[],0,0,[],[],[],[]
13129,15055,Art Health Solutions,elliecaley,"I believe that an organisation’s performance, ...","2,042 followers",[],[],[],[{'title': 'Senior Workplace and Wellbeing Con...,[{'degree': 'Master of Science - MSc (Hons) Oc...,...,0,0.0,0,[],0,0,[],[],[],[]
13130,15056,TEDU,nicolobates,With a background in finance and economics foc...,"1,722 followers",[],[],[],"[{'title': 'Founder, Chief Executive Officer',...",[{'degree': 'Bachelor of Business Administrati...,...,0,0.0,0,[],0,0,[],[],[],[]


In [None]:
print(final_combined_df["num_posts"].head())
print(final_combined_df["num_reposted_posts"].head())
print(final_combined_df.dtypes)


0     0
1     4
2    15
3    13
4     0
Name: num_posts, dtype: int64
0    0
1    2
2    0
3    0
4    0
Name: num_reposted_posts, dtype: int64
Unnamed: 0                              int64
Organization Name                      object
username                               object
about                                  object
followers                              object
posts                                  object
comments                               object
reactions                              object
experience                             object
education                              object
Founded Date                            int64
equity_raised_in_5_years              float64
Success_after_5_years                   int64
equity_rounds_raised_after_5_years      int64
Equity_raised_until_now               float64
Success_until_now                       int64
cleaned_posts                          object
num_posts                               int64
num_reposted_posts          

### Extracting Features out of Comments

In [None]:
# Function to convert time strings like '6mo', '2d', etc., into days
def time_to_days(time_str):
    """
    Converts time strings into their equivalent days.
    Supported units: 'y' (year), 'mo' (month), 'w' (week), 'd' (day),
    'h' (hour), 'm' (minute), 's' (second).
    """
    if not time_str:
        return None
    time_map = {"y": 365, "mo": 30, "w": 7, "d": 1, "h": 1 / 24, "m": 1 / 1440, "s": 1 / 86400}
    match = re.match(r"(\d+)([a-z]+)", time_str.lower())
    if match:
        value, unit = match.groups()
        return int(value) * time_map.get(unit, 0)
    return None

# Function to extract comments and their times from a post
def extract_comments(post, account_name):
    """
    Extracts comments and the times (in days) they were posted for a given account name.
    """
    comments = []
    times_in_days = []

    patterns = [
        rf"{re.escape(account_name)}(?:\s+\w+)?\n\s*•\s*3rd\+\n.*?\b(\d+[a-z]+)\b\n(.+?)(?:\n(?:Like|Reply|Collapse replies|Load more comments))",
        rf"{re.escape(account_name)}(?:,\s*[\w\.]+)*\s*\n\s*Author\n.*?\b(\d+[a-z]+)\b\n(.+?)(?:\n(?:Like|Reply|Collapse replies|Load more comments))",
    ]

    for pattern in patterns:
        matches = re.finditer(pattern, post, re.IGNORECASE | re.DOTALL)

        for match in matches:
            time_str = match.group(1)
            time_in_days = time_to_days(time_str)
            if time_in_days is not None:
                times_in_days.append(time_in_days)

            comment = match.group(2).strip()
            comments.append(comment)

    return comments, times_in_days

# Function to extract the account name from the post
def extract_account_name(post):
    """
    Extracts the account name from the post based on common patterns.
    """
    match = re.search(r"s profile photo\n(.*?) commented on this", post, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    else:
        match = re.search(r"Feed post number \d+\n(.*?)(?:\sreplied|\scommented)", post, re.IGNORECASE)
        if match:
            return match.group(1).strip()
    return None

# Process a list of comments to extract cleaned content and times
def process_comments_list(comments_list):
    """
    Processes a list of comments, extracting cleaned comments and posting times.
    """
    cleaned_comments = []
    days_ago_list = []
    account_name = None

    for post in comments_list:
        if not post:
            continue

        if account_name is None:
            account_name = extract_account_name(post)

        if account_name:
            comments, times_in_days = extract_comments(post, account_name)
            cleaned_comments.extend(comments)
            days_ago_list.extend(times_in_days)
    return cleaned_comments, days_ago_list, account_name

# Wrapper function to process each row of the DataFrame for comments
def process_comments(row):
    """
    Processes a row from the DataFrame, extracting and cleaning comments.
    """
    try:
        comments_list = row["comments"]
        if isinstance(comments_list, list):
            return process_comments_list(comments_list)
        return [], [], None
    except Exception:
        return [], [], None

# Add cleaned comments, comment counts, times, and account names to the DataFrame
def process_row(row):
    """
    Processes a row to add detailed comment information.
    """
    cleaned_comments, days_ago, account_name = process_comments(row)
    return pd.Series({
        "cleaned_comments": cleaned_comments,
        "num_comments": len(cleaned_comments),
        "comment_days_ago": days_ago,
        "account_name": account_name
    })

# Extract details about post engagements
def extract_post_details(post):
    """
    Extracts engagement (likes), comment counts, and repost counts from a post.
    """
    main_number_match = re.search(r"\n(\d+)\n", post)
    main_number = int(main_number_match.group(1)) if main_number_match else 0

    comments_match = re.search(r"(\d+)\scomments", post, re.IGNORECASE)
    comments_number = int(comments_match.group(1)) if comments_match else 0

    reposts_match = re.search(r"(\d+)\sreposts?", post, re.IGNORECASE)
    reposts_number = int(reposts_match.group(1)) if reposts_match else 0

    return main_number, comments_number, reposts_number

# Process engagement details for each post
def process_post_details(row):
    """
    Processes posts to extract engagement, comment, and repost data.
    """
    try:
        comments_list = row["comments"]
        if isinstance(comments_list, list):
            engagements, comments_on_comments, reposts_number = zip(
                *(extract_post_details(post) for post in comments_list)
            )
            return pd.Series({
                "engagements_comments": list(engagements),
                "comments_on_comments": list(comments_on_comments),
                "reposts_number_comments": list(reposts_number),
            })
        return pd.Series({
            "engagements_comments": [],
            "comments_on_comments": [],
            "reposts_number_comments": [],
        })
    except Exception:
        return pd.Series({
            "engagements_comments": [],
            "comments_on_comments": [],
            "reposts_number_comments": [],
        })

# Apply processing functions to the DataFrame
final_combined_df[["cleaned_comments", "num_comments", "comment_days_ago", "account_name"]] = final_combined_df.apply(process_row, axis=1)
final_combined_df[["engagements_comments", "comments_on_comments", "reposts_number_comments"]] = final_combined_df.apply(process_post_details, axis=1)

# Display the resulting DataFrame
final_combined_df


Unnamed: 0.1,Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,...,comments_on_posts,reposts_number_posts,post_times,cleaned_comments,num_comments,comment_days_ago,account_name,engagements_comments,comments_on_comments,reposts_number_comments
0,0,ORA Graphene Audio Inc.,ari-pinkas-88913811,Ari Pinkas is Co-founder and VP Business Devel...,"1,543 followers",[],[],[],"[{'title': 'Co-Founder', 'company_name': 'ORA ...","[{'degree': 'BComm, Marketing', 'institution':...",...,[],[],[],[],0,[],,[],[],[]
1,1,ORA Graphene Audio Inc.,kaiwen-hu-ph-d-a32a0946,- PhD in Materials Engineering. Extensive Expe...,725 followers,"[Feed post number 1\nKaiwen Hu, Ph.D\nKaiwen H...","[Feed post number 1\nKaiwen Hu, Ph.D commented...",[],"[{'title': 'Co-founder VP Research', 'company_...","[{'degree': 'Doctor of Philosophy (Ph.D.), Mat...",...,"[0, 2, 3, 0, 0, 0]","[1, 5, 3, 0, 0, 0]","[60, 2555, 2555, 2555]",[Best of luck Rune!],1,[365],"Kaiwen Hu, Ph.D",[82],[19],[0]
2,2,ORA Graphene Audio Inc.,michaelkraft,CEO / President / Independent Board Director /...,"3,729 followers",[Feed post number 1\nMichael Kraft\nMichael Kr...,[Feed post number 1\nMichael Kraft replied to ...,[Feed post number 1\nMichael Kraft likes this\...,"[{'title': 'Chief Executive Officer', 'company...","[{'degree': 'ExecEd, Technology Marketing & St...",...,"[0, 0, 43, 0, 0, 0, 0, 0, 0, 49, 0, 0, 0, 3, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[60, 60, 240, 365, 365, 365, 365, 365, 730, 73...",[CONGRATS on the CEO role! We may need to talk...,10,"[90, 90, 90, 90, 90, 90, 90, 90, 90, 90]",Michael Kraft,"[28, 71, 28, 68, 243, 85, 66, 108]","[21, 6, 21, 11, 116, 7, 61, 4]","[0, 1, 0, 8, 0, 3, 1, 2]"
3,3,ORA Graphene Audio Inc.,helgeseetzen,My journey took me from tech founder (BrightSi...,"4,925 followers",[Feed post number 1\nHelge Seetzen\nHelge Seet...,[Feed post number 1\nHelge Seetzen replied to ...,[],"[{'title': 'Managing Partner & CEO', 'company_...","[{'degree': 'PhD, Physics & Computer Science (...",...,"[75, 9, 0, 13, 16, 0, 6, 7, 0, 2, 0, 0, 0]","[7, 24, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0]","[120, 730, 1095, 1095, 1095, 1825, 1825, 2190,...","[Congratulations! Bravo to the whole team!!, T...",15,"[90, 90, 120, 120, 120, 120, 120, 120, 120, 12...",Helge Seetzen,"[465, 70, 465, 465, 465, 465, 465, 465]","[75, 5, 75, 75, 75, 75, 75, 75]","[7, 0, 7, 7, 7, 7, 7, 7]"
4,4,ORA Graphene Audio Inc.,regaskell,Robert-Eric Gaskell is an experienced audio pr...,888 followers,[],"[Feed post number 1\nRobert-Eric Gaskell, Ph.D...","[Feed post number 1\nRobert-Eric Gaskell, Ph.D...","[{'title': 'Co-Founder, Inventor, VP Product',...","[{'degree': 'PhD, Sound Recording', 'instituti...",...,[],[],[],"[Which is it? ""100% pure graphene"" or multi la...",1,[1825],"Robert-Eric Gaskell, Ph.D.",[48],[12],[1]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13127,15053,Art Health Solutions,phill-bell-9a784065,As Chief Executive Officer and co-founder of A...,"2,557 followers",[],[],[],"[{'title': 'Chief Executive Officer', 'company...",[{'degree': 'Executive Post Graduate Diploma i...,...,[],[],[],[],0,[],,[],[],[]
13128,15054,Art Health Solutions,scot-forshaw-2859b12,I have worked at the cutting edge of software ...,891 followers,[],[],[],"[{'title': 'fCTO', 'company_name': 'ART Health...",[{'degree': 'City and Guilds of London Institu...,...,[],[],[],[],0,[],,[],[],[]
13129,15055,Art Health Solutions,elliecaley,"I believe that an organisation’s performance, ...","2,042 followers",[],[],[],[{'title': 'Senior Workplace and Wellbeing Con...,[{'degree': 'Master of Science - MSc (Hons) Oc...,...,[],[],[],[],0,[],,[],[],[]
13130,15056,TEDU,nicolobates,With a background in finance and economics foc...,"1,722 followers",[],[],[],"[{'title': 'Founder, Chief Executive Officer',...",[{'degree': 'Bachelor of Business Administrati...,...,[],[],[],[],0,[],,[],[],[]


### Extract Features from Reactions

In [None]:
# Function to convert time strings to days
def time_to_days(time_str):
    """
    Converts time strings like '1 year', '5 days', or '2 hours' into days.
    """
    time_str = time_str.strip().lower()
    if "year" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 365
    elif "month" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 30
    elif "week" in time_str:
        return int(re.search(r"\d+", time_str).group()) * 7
    elif "day" in time_str:
        return int(re.search(r"\d+", time_str).group())
    elif "hour" in time_str or "minute" in time_str or "second" in time_str:
        return 1  # Less than a day is considered as 1 day
    return None

# Function to extract reaction type for a given account name
def extract_reaction(post, account_name):
    """
    Extracts the reaction type (e.g., 'likes this', 'finds this insightful') from a post.
    """
    if not post or not account_name:
        return None
    match = re.search(rf"{re.escape(account_name)}\s+(likes this|finds this insightful|supports this|comments on this)",
                      post, re.IGNORECASE)
    return match.group(1).strip() if match else None

# Function to extract the post text
def extract_post_text(post):
    """
    Extracts post text between '\nFollow\n' and either '\n…more' or '\nhashtag'.
    """
    if not post:
        return None
    match = re.search(r"\nFollow\n(.*?)(?:\n…more|\nhashtag)", post, re.DOTALL)
    return match.group(1).strip() if match else None

# Function to extract the time in days
def extract_time_in_days(post):
    """
    Extracts the time in days based on text patterns like '2 days ago'.
    """
    if not post:
        return None
    match = re.search(r"•?\s*\n\s*(\d+\s*[a-z]+)\s+ago\s+Follow\n", post, re.IGNORECASE)
    if match:
        time_str = match.group(1).strip()
        return time_to_days(time_str)
    return None

# Function to extract hashtags from the post
def extract_hashtags(post):
    """
    Extracts hashtags from the post content.
    """
    if not post:
        return []
    match = re.search(r"\nhashtag\n(#[^\n]+)", post, re.DOTALL)
    if match:
        hashtags = match.group(1).split("\n")
        return [tag.lstrip("#") for tag in hashtags if tag.startswith("#")]
    return []

# Process each row to extract and clean reactions
def process_row(row):
    """
    Processes a row to clean reactions, extract reaction types, hashtags, and times.
    """
    reactions_list = row.get("reactions", [])
    account_name = row.get("account_name")

    cleaned_reactions = []
    reaction_types = []
    hashtag_lists = []
    post_times = []

    for reaction_post in reactions_list:
        if not reaction_post:
            continue
        post_text = extract_post_text(reaction_post)
        reaction_type = extract_reaction(reaction_post, account_name)
        hashtags = extract_hashtags(reaction_post)
        time_in_days = extract_time_in_days(reaction_post)

        if post_text:
            cleaned_reactions.append(post_text)
        if reaction_type:
            reaction_types.append(reaction_type)
        hashtag_lists.append(hashtags)
        if time_in_days is not None:
            post_times.append(time_in_days)

    return pd.Series({
        "cleaned_reactions": cleaned_reactions,
        "reaction_types": reaction_types,
        "hashtag_lists": hashtag_lists,
        "num_posts": len(reactions_list),
        "reaction_times": post_times
    })

# Extract details about engagements, comments, and reposts
def extract_post_details(post):
    """
    Extracts engagement metrics (reactions, comments, reposts) from a post.
    """
    main_number_match = re.search(r"\n(\d+)\n", post)
    main_number = int(main_number_match.group(1)) if main_number_match else 0

    comments_match = re.search(r"(\d+)\scomments", post, re.IGNORECASE)
    comments_number = int(comments_match.group(1)) if comments_match else 0

    reposts_match = re.search(r"(\d+)\sreposts?", post, re.IGNORECASE)
    reposts_number = int(reposts_match.group(1)) if reposts_match else 0

    return main_number, comments_number, reposts_number

# Process reactions to create detailed engagement columns
def process_reactions_details(row):
    """
    Processes reactions to extract engagement, comment, and repost data.
    """
    reactions_list = row.get("reactions", [])
    if isinstance(reactions_list, list):
        try:
            engagements, comments, reposts = zip(
                *(extract_post_details(post) for post in reactions_list)
            )
            return pd.Series({
                "engagements_reactions": list(engagements),
                "comments_on_reactions": list(comments),
                "reposts_number_reactions": list(reposts),
            })
        except ValueError:
            pass
    return pd.Series({
        "engagements_reactions": [],
        "comments_on_reactions": [],
        "reposts_number_reactions": [],
    })

# Apply the processing functions to the DataFrame
temp_result = final_combined_df.apply(process_row, axis=1)
final_combined_df["cleaned_reactions"] = temp_result["cleaned_reactions"]
final_combined_df["reaction_types"] = temp_result["reaction_types"]
final_combined_df["hashtag_lists"] = temp_result["hashtag_lists"]
final_combined_df["num_posts"] = temp_result["num_posts"]
final_combined_df["reaction_times"] = temp_result["reaction_times"]

# Add engagement details to the DataFrame
reactions_details = final_combined_df.apply(process_reactions_details, axis=1)
final_combined_df["engagements_reactions"] = reactions_details["engagements_reactions"]
final_combined_df["comments_on_reactions"] = reactions_details["comments_on_reactions"]
final_combined_df["reposts_number_reactions"] = reactions_details["reposts_number_reactions"]

# Display the updated DataFrame
final_combined_df


TypeError: 'float' object is not iterable

In [None]:
# Process each row for reactions and related details
def process_row(row):
    """
    Processes a DataFrame row to extract cleaned reactions, reaction types, hashtags, and post times.
    """
    # Extract reactions and account name
    reactions_list = row.get("reactions", [])
    account_name = row.get("account_name")

    # Initialize outputs
    cleaned_reactions = []
    reaction_types = []
    hashtag_lists = []
    post_times = []

    # Ensure reactions_list is iterable
    if not isinstance(reactions_list, list):
        reactions_list = []

    # Process each reaction
    for reaction_post in reactions_list:
        if not isinstance(reaction_post, str):  # Skip invalid posts
            continue

        # Extract details
        post_text = extract_post_text(reaction_post)
        reaction_type = extract_reaction(reaction_post, account_name)
        hashtags = extract_hashtags(reaction_post)
        time_in_days = extract_time_in_days(reaction_post)

        # Append to respective lists
        if post_text:
            cleaned_reactions.append(post_text)
        if reaction_type:
            reaction_types.append(reaction_type)
        hashtag_lists.append(hashtags)
        if time_in_days is not None:
            post_times.append(time_in_days)

    # Return a series of processed details
    return pd.Series({
        "cleaned_reactions": cleaned_reactions,
        "reaction_types": reaction_types,
        "hashtag_lists": hashtag_lists,
        "new_num_posts": len(reactions_list),
        "new_post_times": post_times
    })

# Process reactions for engagement details
def process_reactions_details(row):
    """
    Processes reactions to extract engagement metrics (reactions, comments, reposts).
    """
    try:
        reactions_list = row.get("reactions", [])
        if not isinstance(reactions_list, list):
            reactions_list = []

        # Extract engagement details for valid posts
        engagements, comments, reposts = zip(
            *(extract_post_details(post) for post in reactions_list if isinstance(post, str))
        ) if reactions_list else ([], [], [])

        # Return extracted details
        return pd.Series({
            "engagements_reactions": list(engagements),
            "comments_on_reactions": list(comments),
            "reposts_number_reactions": list(reposts),
        })
    except Exception as e:
        # Return empty lists on error
        return pd.Series({
            "engagements_reactions": [],
            "comments_on_reactions": [],
            "reposts_number_reactions": [],
        })

# Apply processing functions to the DataFrame
temp_result = final_combined_df.apply(process_row, axis=1)
final_combined_df["cleaned_reactions"] = temp_result["cleaned_reactions"]
final_combined_df["reaction_types"] = temp_result["reaction_types"]
final_combined_df["hashtag_lists"] = temp_result["hashtag_lists"]
final_combined_df["num_posts"] = temp_result["new_num_posts"]
final_combined_df["reaction_times"] = temp_result["new_post_times"]

# Add engagement metrics to the DataFrame
reactions_details = final_combined_df.apply(process_reactions_details, axis=1)
final_combined_df["engagements_reactions"] = reactions_details["engagements_reactions"]
final_combined_df["comments_on_reactions"] = reactions_details["comments_on_reactions"]
final_combined_df["reposts_number_reactions"] = reactions_details["reposts_number_reactions"]

# Display the updated DataFrame
final_combined_df


Unnamed: 0.1,Unnamed: 0,Organization Name,username,about,followers,posts,comments,reactions,experience,education,...,engagements_comments,comments_on_comments,reposts_number_comments,cleaned_reactions,reaction_types,hashtag_lists,reaction_times,engagements_reactions,comments_on_reactions,reposts_number_reactions
0,0,ORA Graphene Audio Inc.,ari-pinkas-88913811,Ari Pinkas is Co-founder and VP Business Devel...,"1,543 followers",[],[],[],"[{'title': 'Co-Founder', 'company_name': 'ORA ...","[{'degree': 'BComm, Marketing', 'institution':...",...,[],[],[],[],[],[],[],[],[],[]
1,1,ORA Graphene Audio Inc.,kaiwen-hu-ph-d-a32a0946,- PhD in Materials Engineering. Extensive Expe...,725 followers,"[Feed post number 1\nKaiwen Hu, Ph.D\nKaiwen H...","[Feed post number 1\nKaiwen Hu, Ph.D commented...",[],"[{'title': 'Co-founder VP Research', 'company_...","[{'degree': 'Doctor of Philosophy (Ph.D.), Mat...",...,[82],[19],[0],[],[],[],[],[],[],[]
2,2,ORA Graphene Audio Inc.,michaelkraft,CEO / President / Independent Board Director /...,"3,729 followers",[Feed post number 1\nMichael Kraft\nMichael Kr...,[Feed post number 1\nMichael Kraft replied to ...,[Feed post number 1\nMichael Kraft likes this\...,"[{'title': 'Chief Executive Officer', 'company...","[{'degree': 'ExecEd, Technology Marketing & St...",...,"[28, 71, 28, 68, 243, 85, 66, 108]","[21, 6, 21, 11, 116, 7, 61, 4]","[0, 1, 0, 8, 0, 3, 1, 2]","[Over the past week, I've heard the same thing...","[likes this, likes this, likes this, finds thi...","[[], [], [], [], [], [], [], [GivingCandyToStr...","[7, 7, 60, 60, 120, 180, 300, 270, 300, 365, 3...","[0, 106, 80, 988, 558, 56, 210, 46, 0, 0, 98, 0]","[0, 10, 15, 215, 10, 6, 10, 0, 132, 0, 14, 0]","[222, 27, 0, 156, 19, 1, 5, 0, 284, 0, 1, 0]"
3,3,ORA Graphene Audio Inc.,helgeseetzen,My journey took me from tech founder (BrightSi...,"4,925 followers",[Feed post number 1\nHelge Seetzen\nHelge Seet...,[Feed post number 1\nHelge Seetzen replied to ...,[],"[{'title': 'Managing Partner & CEO', 'company_...","[{'degree': 'PhD, Physics & Computer Science (...",...,"[465, 70, 465, 465, 465, 465, 465, 465]","[75, 5, 75, 75, 75, 75, 75, 75]","[7, 0, 7, 7, 7, 7, 7, 7]",[],[],[],[],[],[],[]
4,4,ORA Graphene Audio Inc.,regaskell,Robert-Eric Gaskell is an experienced audio pr...,888 followers,[],"[Feed post number 1\nRobert-Eric Gaskell, Ph.D...","[Feed post number 1\nRobert-Eric Gaskell, Ph.D...","[{'title': 'Co-Founder, Inventor, VP Product',...","[{'degree': 'PhD, Sound Recording', 'instituti...",...,[48],[12],[1],"[Thank you Robert-Eric Gaskell, Ph.D. and Ari ...","[likes this, likes this, likes this, likes thi...","[[], [], [], [], [], [], [CES2024 it's clear w...","[21, 30, 30, 150, 300, 300, 300, 330, 330, 330...","[0, 162, 120, 69, 78, 169, 172, 26, 0, 31, 90,...","[0, 17, 15, 22, 12, 30, 6, 4, 0, 0, 0, 0, 0]","[0, 6, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13127,15053,Art Health Solutions,phill-bell-9a784065,As Chief Executive Officer and co-founder of A...,"2,557 followers",[],[],[],"[{'title': 'Chief Executive Officer', 'company...",[{'degree': 'Executive Post Graduate Diploma i...,...,[],[],[],[],[],[],[],[],[],[]
13128,15054,Art Health Solutions,scot-forshaw-2859b12,I have worked at the cutting edge of software ...,891 followers,[],[],[],"[{'title': 'fCTO', 'company_name': 'ART Health...",[{'degree': 'City and Guilds of London Institu...,...,[],[],[],[],[],[],[],[],[],[]
13129,15055,Art Health Solutions,elliecaley,"I believe that an organisation’s performance, ...","2,042 followers",[],[],[],[{'title': 'Senior Workplace and Wellbeing Con...,[{'degree': 'Master of Science - MSc (Hons) Oc...,...,[],[],[],[],[],[],[],[],[],[]
13130,15056,TEDU,nicolobates,With a background in finance and economics foc...,"1,722 followers",[],[],[],"[{'title': 'Founder, Chief Executive Officer',...",[{'degree': 'Bachelor of Business Administrati...,...,[],[],[],[],[],[],[],[],[],[]


In [None]:
final_combined_df.at[2, "reactions"][2]

'Feed post number 3\nMichael Kraft celebrates this\nMichelle Rucci\nMichelle Rucci\n • 3rd+\n • 3rd+\nOffice Manager at ORA Graphene Audio\nOffice Manager at ORA Graphene Audio\n2mo • \n 2 months ago\nFollow\nI’m happy to share that I’m starting a new position as Office Manager at ORA Graphene Audio!\nStarting a New Position\n80\n15 comments'

In [None]:
#final_combined_df.at[2, "Linkedin_url"]

In [None]:
final_combined_df.columns

Index(['Unnamed: 0', 'Organization Name', 'username', 'about', 'followers',
       'posts', 'comments', 'reactions', 'experience', 'education',
       'Founded Date', 'equity_raised_in_5_years', 'Success_after_5_years',
       'equity_rounds_raised_after_5_years', 'Equity_raised_until_now',
       'Success_until_now', 'cleaned_posts', 'num_posts', 'num_reposted_posts',
       'engagements_posts', 'comments_on_posts', 'reposts_number_posts',
       'post_times', 'cleaned_comments', 'num_comments', 'comment_days_ago',
       'account_name', 'engagements_comments', 'comments_on_comments',
       'reposts_number_comments', 'cleaned_reactions', 'reaction_types',
       'hashtag_lists', 'reaction_times', 'engagements_reactions',
       'comments_on_reactions', 'reposts_number_reactions'],
      dtype='object')

In [None]:
final_combined_df[['Organization Name']].nunique()

Organization Name    3089
dtype: int64

### Extract Numnber of Followers

In [None]:
# Remove the "followers" text and commas, then convert to integers
final_combined_df['followers'] = final_combined_df['followers'] \
    .str.replace('followers', '', regex=False) \
    .str.replace(',', '', regex=False) \
    .astype(int)

# Verify the conversion
final_combined_df["Success_after_5_years"].unique()


array([0, 1], dtype=int64)

In [None]:
final_combined_df.to_csv(r"..\Final_Company_Dataset\final_combined_df_first_half1.csv")

# Stop Here go to next notebook: Add_Features-start-closed