### Import libraries

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the upvote/downvote data
file_path = r"C:\Users\ASUS\OneDrive\Desktop\Greedy Game\Task 4\4_voting_data.csv"
df = pd.read_csv(file_path)

### Exploratory Data Analysis

In [4]:
# Basic data exploration
print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())

Dataset shape: (1048575, 8)

Column names:
['adv_id', 'offer_id', 'app_id', 'state', 'comment', 'ip', 'country_code', 'created_at']


In [5]:
# Display first few rows
df.head()

Unnamed: 0,adv_id,offer_id,app_id,state,comment,ip,country_code,created_at
0,5ba99349-0e56-4a15-96d8-6708f21ae548,254610,rupiyo,UP,,152.58.197.244,IN,2024-10-17 08:38:41.178855+00:00
1,d016c006-f927-48bc-987d-40a2f3f4648b,258019,rupiyo,UP,,152.59.191.187,IN,2024-10-20 03:50:17.578827+00:00
2,d016c006-f927-48bc-987d-40a2f3f4648b,258634,rupiyo,UP,,152.58.129.31,IN,2024-10-21 15:52:12.762049+00:00
3,ed359404-3d8a-4f3e-9d4e-38e6ef575254,258019,rupiyo,UP,,152.59.191.187,IN,2024-10-20 03:50:17.578827+00:00
4,ed359404-3d8a-4f3e-9d4e-38e6ef575254,258634,rupiyo,UP,,152.58.129.31,IN,2024-10-21 15:52:12.762049+00:00


In [6]:
# Check data types and basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 8 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   adv_id        1048504 non-null  object
 1   offer_id      1048575 non-null  int64 
 2   app_id        1048575 non-null  object
 3   state         1048575 non-null  object
 4   comment       46254 non-null    object
 5   ip            1048575 non-null  object
 6   country_code  1048575 non-null  object
 7   created_at    1048575 non-null  object
dtypes: int64(1), object(7)
memory usage: 64.0+ MB


In [7]:
# Basic statistics
df.describe()

Unnamed: 0,offer_id
count,1048575.0
mean,221808.1
std,77693.66
min,5272.0
25%,248702.0
50%,257219.0
75%,257907.0
max,261755.0


In [8]:
# Convert created_at to datetime
df['created_at'] = pd.to_datetime(df['created_at'], format='mixed')

In [9]:
# Check state distribution (UP vs DOWN)
print("Vote distribution:")
vote_counts = df['state'].value_counts()
print(vote_counts)
print(f"\nUpvote percentage: {vote_counts.get('UP', 0) / len(df) * 100:.2f}%")
print(f"Downvote percentage: {vote_counts.get('DOWN', 0) / len(df) * 100:.2f}%")

Vote distribution:
state
UP      1002321
DOWN      46254
Name: count, dtype: int64

Upvote percentage: 95.59%
Downvote percentage: 4.41%


In [10]:
# Overall sentiment ratio
total_votes = len(df)
upvotes = len(df[df['state'] == 'UP'])
downvotes = len(df[df['state'] == 'DOWN'])
sentiment_ratio = upvotes / downvotes if downvotes > 0 else float('inf')

print(f"\nOverall Sentiment Analysis:")
print(f"Total votes: {total_votes}")
print(f"Upvotes: {upvotes}")
print(f"Downvotes: {downvotes}")
print(f"Sentiment ratio (Up/Down): {sentiment_ratio:.2f}")


Overall Sentiment Analysis:
Total votes: 1048575
Upvotes: 1002321
Downvotes: 46254
Sentiment ratio (Up/Down): 21.67


In [11]:
# Country-wise analysis
country_stats = df.groupby('country_code').agg({
    'state': ['count', lambda x: sum(x == 'UP'), lambda x: sum(x == 'DOWN')],
    'adv_id': 'nunique',
    'offer_id': 'nunique'
}).round(2)

In [12]:
# Flatten column names for country stats
country_stats.columns = ['total_votes', 'upvotes', 'downvotes', 'unique_users', 'unique_offers']
country_stats = country_stats.reset_index()

In [13]:
# Calculate country-wise metrics
country_stats['upvote_rate'] = (country_stats['upvotes'] / country_stats['total_votes'] * 100).round(2)
country_stats['downvote_rate'] = (country_stats['downvotes'] / country_stats['total_votes'] * 100).round(2)
country_stats['sentiment_ratio'] = (country_stats['upvotes'] / country_stats['downvotes']).round(2)

In [14]:
# Display top countries by vote volume
print("Top 10 countries by vote volume:")
top_countries = country_stats.nlargest(10, 'total_votes')
print(top_countries[['country_code', 'total_votes', 'upvote_rate', 'unique_users', 'unique_offers']])

Top 10 countries by vote volume:
   country_code  total_votes  upvote_rate  unique_users  unique_offers
43           IN      1028879        95.59        166783            542
7            BD         5896        98.91          5578             89
0            AE         5451        99.56          5257            134
78           PH         1486        91.12           813            187
73           NP          927        82.52           472             85
41           ID          898        87.86           561            151
79           PK          704        95.45           514             88
58           MA          270        93.70           127             59
25           DZ          263        93.92           139             51
67           MX          218        97.25           133             58


In [15]:
# Offer-wise sentiment analysis
offer_stats = df.groupby('offer_id').agg({
    'state': ['count', lambda x: sum(x == 'UP'), lambda x: sum(x == 'DOWN')],
    'adv_id': 'nunique',
    'app_id': 'nunique',
    'country_code': 'nunique'
}).round(2)

In [16]:
# Flatten offer stats columns
offer_stats.columns = ['total_votes', 'upvotes', 'downvotes', 'unique_users', 'unique_apps', 'countries']
offer_stats = offer_stats.reset_index()

In [17]:
# Calculate offer-wise metrics
offer_stats['upvote_rate'] = (offer_stats['upvotes'] / offer_stats['total_votes'] * 100).round(2)
offer_stats['engagement_score'] = offer_stats['total_votes'] / offer_stats['unique_users']
offer_stats['sentiment_ratio'] = (offer_stats['upvotes'] / offer_stats['downvotes']).round(2)

In [18]:
# Identify best and worst performing offers
best_offers = offer_stats.nlargest(10, 'upvote_rate')
worst_offers = offer_stats.nsmallest(10, 'upvote_rate')

print("Top 10 best performing offers (by upvote rate):")
print(best_offers[['offer_id', 'total_votes', 'upvote_rate', 'unique_users']])

print("\nTop 10 worst performing offers (by upvote rate):")
print(worst_offers[['offer_id', 'total_votes', 'upvote_rate', 'unique_users']])

Top 10 best performing offers (by upvote rate):
    offer_id  total_votes  upvote_rate  unique_users
0       5272            1        100.0             1
1       5273            9        100.0             9
2       6086         5157        100.0          5157
5       8561            1        100.0             1
7      10353            1        100.0             1
9      11741            1        100.0             1
14     13031            2        100.0             2
16     13566            1        100.0             1
17     13599            1        100.0             1
19     13758            1        100.0             1

Top 10 worst performing offers (by upvote rate):
     offer_id  total_votes  upvote_rate  unique_users
4        7908            1          0.0             1
13      12912            1          0.0             1
108     28490            1          0.0             1
156     49946            1          0.0             1
157     49948            1          0.0          

In [19]:
# App-wise analysis
app_stats = df.groupby('app_id').agg({
    'state': ['count', lambda x: sum(x == 'UP'), lambda x: sum(x == 'DOWN')],
    'adv_id': 'nunique',
    'offer_id': 'nunique'
}).round(2)

In [20]:
# Flatten app stats columns
app_stats.columns = ['total_votes', 'upvotes', 'downvotes', 'unique_users', 'unique_offers']
app_stats = app_stats.reset_index()
app_stats['upvote_rate'] = (app_stats['upvotes'] / app_stats['total_votes'] * 100).round(2)

In [21]:
# Display app performance
print("App performance analysis:")
print(app_stats.sort_values('total_votes', ascending=False)[['app_id', 'total_votes', 'upvote_rate', 'unique_offers']])

App performance analysis:
             app_id  total_votes  upvote_rate  unique_offers
19   offerwall_1072       340186        98.48             65
144   offerwall_916       241800       100.00             25
55     offerwall_31        55937        90.19            404
110    offerwall_59        53577        99.46             31
46    offerwall_260        39783       100.00              7
..              ...          ...          ...            ...
98    offerwall_542            1       100.00              1
120   offerwall_664            1       100.00              1
127   offerwall_740            1       100.00              1
124   offerwall_708            1         0.00              1
146   offerwall_939            1       100.00              1

[155 rows x 4 columns]


In [22]:
# User behavior analysis
user_stats = df.groupby('adv_id').agg({
    'state': ['count', lambda x: sum(x == 'UP'), lambda x: sum(x == 'DOWN')],
    'offer_id': 'nunique',
    'app_id': 'nunique',
    'country_code': 'first'
}).round(2)

In [23]:
# Flatten user stats columns
user_stats.columns = ['total_votes', 'upvotes', 'downvotes', 'unique_offers', 'unique_apps', 'country']
user_stats = user_stats.reset_index()

In [24]:
# Calculate user engagement metrics
user_stats['upvote_rate'] = (user_stats['upvotes'] / user_stats['total_votes'] * 100).round(2)
user_stats['user_type'] = user_stats['upvote_rate'].apply(
    lambda x: 'Positive' if x >= 70 else 'Negative' if x <= 30 else 'Neutral'
)

In [25]:
# User type distribution
print("User sentiment distribution:")
user_type_dist = user_stats['user_type'].value_counts()
print(user_type_dist)
print(f"\nPercentage breakdown:")
for user_type, count in user_type_dist.items():
    print(f"{user_type}: {count/len(user_stats)*100:.2f}%")

User sentiment distribution:
user_type
Positive    154540
Neutral       9693
Negative      7846
Name: count, dtype: int64

Percentage breakdown:
Positive: 89.81%
Neutral: 5.63%
Negative: 4.56%


In [26]:
# Time-based analysis
df['date'] = df['created_at'].dt.date
df['hour'] = df['created_at'].dt.hour
df['day_of_week'] = df['created_at'].dt.dayofweek
df['month'] = df['created_at'].dt.month

In [27]:
# Daily sentiment trends
daily_sentiment = df.groupby('date').agg({
    'state': ['count', lambda x: sum(x == 'UP'), lambda x: sum(x == 'DOWN')]
}).round(2)

daily_sentiment.columns = ['total_votes', 'upvotes', 'downvotes']
daily_sentiment = daily_sentiment.reset_index()
daily_sentiment['upvote_rate'] = (daily_sentiment['upvotes'] / daily_sentiment['total_votes'] * 100).round(2)

In [28]:
# Weekly patterns
weekly_pattern = df.groupby('day_of_week').agg({
    'state': ['count', lambda x: sum(x == 'UP') / len(x) * 100]
}).round(2)

weekly_pattern.columns = ['total_votes', 'upvote_rate']
weekly_pattern = weekly_pattern.reset_index()
weekly_pattern['day_name'] = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

print("Weekly voting patterns:")
print(weekly_pattern[['day_name', 'total_votes', 'upvote_rate']])

Weekly voting patterns:
  day_name  total_votes  upvote_rate
0      Mon        69613        95.32
1      Tue        82756        95.39
2      Wed       306418        96.57
3      Thu       253129        95.14
4      Fri       101624        92.80
5      Sat       124326        96.24
6      Sun       110709        96.06


In [29]:
# Hourly patterns
hourly_pattern = df.groupby('hour').agg({
    'state': ['count', lambda x: sum(x == 'UP') / len(x) * 100]
}).round(2)

hourly_pattern.columns = ['total_votes', 'upvote_rate']
hourly_pattern = hourly_pattern.reset_index()

print("\nPeak voting hours:")
peak_hours = hourly_pattern.nlargest(5, 'total_votes')
print(peak_hours)


Peak voting hours:
    hour  total_votes  upvote_rate
3      3       112084        97.52
8      8        94259        97.61
11    11        71333        97.69
13    13        65279        93.92
6      6        63675        95.17


In [30]:
# Comment length analysis (if comments exist)
if 'comment' in df.columns:
    df['comment_length'] = df['comment'].astype(str).str.len()
    
    comment_sentiment = df.groupby('state')['comment_length'].agg(['mean', 'median', 'std']).round(2)
    print("Comment length by sentiment:")
    print(comment_sentiment)

Comment length by sentiment:
        mean  median    std
state                      
DOWN   74.24    47.0  58.68
UP      3.00     3.0   0.00


In [31]:
# Geographic insights
geo_insights = country_stats.copy()
geo_insights['engagement_level'] = geo_insights['total_votes'].apply(
    lambda x: 'High' if x > geo_insights['total_votes'].quantile(0.75) else 
             'Medium' if x > geo_insights['total_votes'].quantile(0.25) else 'Low'
)

print("\nGeographic engagement levels:")
print(geo_insights['engagement_level'].value_counts())


Geographic engagement levels:
engagement_level
Medium    57
Low       31
High      29
Name: count, dtype: int64


In [32]:
# Offer quality scoring
offer_quality = offer_stats.copy()
offer_quality['quality_score'] = (
    (offer_quality['upvote_rate'] / 100) * 0.6 +
    (offer_quality['total_votes'] / offer_quality['total_votes'].max()) * 0.3 +
    (offer_quality['unique_users'] / offer_quality['unique_users'].max()) * 0.1
) * 100

offer_quality['quality_grade'] = offer_quality['quality_score'].apply(
    lambda x: 'A' if x >= 80 else 'B' if x >= 60 else 'C' if x >= 40 else 'D'
)

In [33]:
# Quality grade distribution
print("Offer quality distribution:")
quality_dist = offer_quality['quality_grade'].value_counts()
print(quality_dist)

Offer quality distribution:
quality_grade
B    581
C    393
D     73
A      3
Name: count, dtype: int64


In [34]:
# Cross-platform analysis
cross_platform = df.groupby(['app_id', 'country_code']).agg({
    'state': ['count', lambda x: sum(x == 'UP') / len(x) * 100],
    'adv_id': 'nunique'
}).round(2)

cross_platform.columns = ['total_votes', 'upvote_rate', 'unique_users']
cross_platform = cross_platform.reset_index()

In [35]:
# Identify regional preferences
regional_prefs = cross_platform.groupby('country_code').agg({
    'upvote_rate': 'mean',
    'total_votes': 'sum',
    'unique_users': 'sum'
}).round(2)

print("Regional voting preferences:")
print(regional_prefs.sort_values('upvote_rate', ascending=False).head(10))

Regional voting preferences:
              upvote_rate  total_votes  unique_users
country_code                                        
AF                  100.0            5             4
AM                  100.0            1             1
BG                  100.0            4             3
BA                  100.0            1             1
AZ                  100.0            7             6
CR                  100.0           21            13
CM                  100.0            1             1
CY                  100.0            1             1
BH                  100.0            1             1
BO                  100.0          180            63


In [36]:
# User engagement depth analysis
engagement_depth = user_stats.copy()
engagement_depth['engagement_depth'] = engagement_depth['unique_offers'].apply(
    lambda x: 'Deep' if x >= 5 else 'Medium' if x >= 3 else 'Shallow'
)

engagement_sentiment = engagement_depth.groupby('engagement_depth')['upvote_rate'].agg(['mean', 'count']).round(2)
print("\nEngagement depth vs sentiment:")
print(engagement_sentiment)


Engagement depth vs sentiment:
                   mean   count
engagement_depth               
Deep              94.49   34377
Medium            90.26   16940
Shallow           91.86  120762


In [37]:
# Voting velocity analysis
df['voting_velocity'] = df.groupby('adv_id')['created_at'].diff().dt.total_seconds() / 3600
velocity_sentiment = df.groupby('state')['voting_velocity'].agg(['mean', 'median']).round(2)

print("\nVoting velocity by sentiment (hours between votes):")
print(velocity_sentiment)


Voting velocity by sentiment (hours between votes):
       mean  median
state              
DOWN   0.93    0.14
UP     2.00    0.73


In [38]:
# Seasonal trends
df['month_name'] = df['created_at'].dt.strftime('%B')
monthly_trends = df.groupby('month_name').agg({
    'state': ['count', lambda x: sum(x == 'UP') / len(x) * 100]
}).round(2)

monthly_trends.columns = ['total_votes', 'upvote_rate']
print("\nMonthly voting trends:")
print(monthly_trends)


Monthly voting trends:
            total_votes  upvote_rate
month_name                          
October         1048575        95.59


In [39]:
# Key insights summary
print("\n" + "-"*60)
print("Main takeaways")
print("-"*60)

insights = {
    'Overall Sentiment': f"{upvotes/(upvotes+downvotes)*100:.1f}% positive",
    'Most Engaged Country': top_countries.iloc[0]['country_code'],
    'Best Performing Offer': best_offers.iloc[0]['offer_id'],
    'User Satisfaction': f"{len(user_stats[user_stats['user_type']=='Positive'])/len(user_stats)*100:.1f}% positive users",
    'Peak Activity Day': weekly_pattern.loc[weekly_pattern['total_votes'].idxmax(), 'day_name'],
    'Quality Offers (A-grade)': f"{quality_dist.get('A', 0)} offers",
    'Average Engagement': f"{user_stats['unique_offers'].mean():.1f} offers per user"
}

for key, value in insights.items():
    print(f"{key}: {value}")


------------------------------------------------------------
Main takeaways
------------------------------------------------------------
Overall Sentiment: 95.6% positive
Most Engaged Country: IN
Best Performing Offer: 5272.0
User Satisfaction: 89.8% positive users
Peak Activity Day: Wed
Quality Offers (A-grade): 3 offers
Average Engagement: 5.9 offers per user


In [40]:
# Recommendations based on insights
print("\n" + "-"*60)
print("RECOMMENDATIONS")
print("-"*60)

recommendations = [
    "Focus on replicating success factors from A-grade offers",
    f"Optimize content for {top_countries.iloc[0]['country_code']} market (highest engagement)",
    "Investigate and improve D-grade offers to reduce negative sentiment",
    f"Leverage peak activity on {weekly_pattern.loc[weekly_pattern['total_votes'].idxmax(), 'day_name']}s for important launches",
    "Develop retention strategies for shallow engagement users",
    "Create region-specific content based on voting preferences"
]

for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")


------------------------------------------------------------
RECOMMENDATIONS
------------------------------------------------------------
1. Focus on replicating success factors from A-grade offers
2. Optimize content for IN market (highest engagement)
3. Investigate and improve D-grade offers to reduce negative sentiment
4. Leverage peak activity on Weds for important launches
5. Develop retention strategies for shallow engagement users
6. Create region-specific content based on voting preferences


In [41]:


# Final metrics dashboard
dashboard_metrics = {
    'Total Votes': f"{total_votes:,}",
    'Unique Users': f"{df['adv_id'].nunique():,}",
    'Unique Offers': f"{df['offer_id'].nunique():,}",
    'Countries': f"{df['country_code'].nunique()}",
    'Apps': f"{df['app_id'].nunique()}",
    'Avg Daily Votes': f"{daily_sentiment['total_votes'].mean():.0f}",
    'Sentiment Ratio': f"{sentiment_ratio:.2f}",
    'User Satisfaction': f"{(upvotes/total_votes*100):.1f}%"
}

print("\n" + "-"*60)
print("Key Performance Indicators to Monitor")
print("-"*60)

for metric, value in dashboard_metrics.items():
    print(f"{metric}: {value}")


------------------------------------------------------------
Key Performance Indicators to Monitor
------------------------------------------------------------
Total Votes: 1,048,575
Unique Users: 172,079
Unique Offers: 1,050
Countries: 117
Apps: 155
Avg Daily Votes: 131072
Sentiment Ratio: 21.67
User Satisfaction: 95.6%
