In [1]:
import pandas as pd

# Load your datasets
poll = pd.read_csv('president_approval.csv')
headlines = pd.read_csv('headlines_f.csv')

poll = poll.iloc[:, :2]

# Ensure the date columns are in the same format
poll['end_date'] = pd.to_datetime(poll['end_date'])
headlines['Date'] = pd.to_datetime(headlines['Date'])

# Perform the left join
merged_df = pd.merge(poll, headlines, left_on='end_date', right_on='Date', how='left')

# Remove rows where there's no matching date (i.e., where 'Headline' is null)
merged_df = merged_df.dropna(subset=['Headline'])

# If you want to keep only one date column, you can drop the redundant one
merged_df = merged_df.drop('Date', axis=1)

# Display the first few rows to verify
print(merged_df.head(25))

# Save the result if needed
merged_df.to_csv('merged_poll_headlines.csv', index=False)

       end_date   yes                                           Headline  \
3455 2022-02-25  41.0  Error: 429 Client Error: Too Many Requests for...   
3456 2022-02-25  41.0  Error: 429 Client Error: Too Many Requests for...   
3457 2022-02-25  41.0  Error: 429 Client Error: Too Many Requests for...   
3458 2022-02-25  41.0  Error: 429 Client Error: Too Many Requests for...   
3459 2022-02-25  41.0  Error: 429 Client Error: Too Many Requests for...   
3460 2022-02-25  41.0  Error: 429 Client Error: Too Many Requests for...   
3461 2022-02-25  41.0  Error: 429 Client Error: Too Many Requests for...   
3462 2022-02-25  41.0  Error: 429 Client Error: Too Many Requests for...   
3463 2022-02-25  41.0  Error: 429 Client Error: Too Many Requests for...   
3464 2022-02-25  41.0  Error: 429 Client Error: Too Many Requests for...   
3465 2022-02-25  38.9  Error: 429 Client Error: Too Many Requests for...   
3466 2022-02-25  38.9  Error: 429 Client Error: Too Many Requests for...   
3467 2022-02

In [2]:
# Group by 'end_date' and 'Headline' to ensure uniqueness
unique_merged_df = (
    merged_df
    .groupby(['end_date', 'Headline'], as_index=False)
    .agg({'yes': 'mean'})  # or any other aggregation function you need
)

# Display the first few rows to verify
print(unique_merged_df.head())


    end_date                                           Headline        yes
0 2021-01-21                  Biden Rolls Back the Trump Legacy  51.911667
1 2021-01-21  Charlottesville Inspired Biden to Run. Now It ...  51.911667
2 2021-01-21  Jen Psaki’s Debut: No Attacks, No Lectures, No...  51.911667
3 2021-01-21  The Three Types of Republicans Donald Trump Cr...  51.911667
4 2021-01-22                   Biden’s Virus Plans Meet Reality  56.000000


In [3]:
merged_df=unique_merged_df

In [4]:
unique_merged_df.shape

(810, 3)

In [5]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Function to get sentiment score using BERT
def get_sentiment_score(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    outputs = model(**inputs)
    scores = outputs[0][0].detach().numpy()
    return scores.argmax() + 1  # Returns a score from 1 (very negative) to 5 (very positive)

# Print column names and shape of the DataFrame
print("Columns in the DataFrame:", merged_df.columns)
print("Shape of the DataFrame:", merged_df.shape)

# Perform sentiment analysis
print("Calculating sentiment scores...")
merged_df['sentiment'] = merged_df['Headline'].apply(get_sentiment_score)
print("Sentiment scores calculated and added to the DataFrame.")

# Print updated column names and shape
print("Updated columns in the DataFrame:", merged_df.columns)
print("Updated shape of the DataFrame:", merged_df.shape)

# Convert date column to datetime
date_column = 'end_date'  # Make sure this matches your actual date column name
merged_df[date_column] = pd.to_datetime(merged_df[date_column])
merged_df.set_index(date_column, inplace=True)

print("Shape after setting index:", merged_df.shape)

# Aggregate sentiment scores and approval ratings (weekly)
weekly_sentiment = merged_df.resample('W')['sentiment'].mean()
weekly_approval = merged_df.resample('W')['yes'].mean()

print("Shape of weekly_sentiment:", weekly_sentiment.shape)
print("Shape of weekly_approval:", weekly_approval.shape)

# Combine sentiment and approval data
data = pd.concat([weekly_sentiment, weekly_approval], axis=1).dropna()

print("Shape of combined data:", data.shape)

# Create lagged features
for i in range(1, 3):  # Creating 4 weeks of lagged features
    data[f'sentiment_lag_{i}'] = data['sentiment'].shift(i)

data = data.dropna()

print("Shape of data after creating lagged features:", data.shape)

# If the dataset is not empty, proceed with modeling
if data.shape[0] > 0:
    # Prepare features and target
    X = data.drop('yes', axis=1)
    y = data['yes']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a Random Forest model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared Score: {r2}")

    # Feature importance
    feature_importance = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
    print(feature_importance.sort_values('importance', ascending=False))
else:
    print("The dataset is empty after processing.▌")

  from .autonotebook import tqdm as notebook_tqdm


Columns in the DataFrame: Index(['end_date', 'Headline', 'yes'], dtype='object')
Shape of the DataFrame: (810, 3)
Calculating sentiment scores...
Sentiment scores calculated and added to the DataFrame.
Updated columns in the DataFrame: Index(['end_date', 'Headline', 'yes', 'sentiment'], dtype='object')
Updated shape of the DataFrame: (810, 4)
Shape after setting index: (810, 3)
Shape of weekly_sentiment: (58,)
Shape of weekly_approval: (58,)
Shape of combined data: (58, 2)
Shape of data after creating lagged features: (56, 4)
Mean Squared Error: 18.965501260073992
R-squared Score: 0.14783450407941412
           feature  importance
1  sentiment_lag_1    0.370634
2  sentiment_lag_2    0.324555
0        sentiment    0.304811


In [5]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Function to get sentiment score using BERT
def get_sentiment_score(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    outputs = model(**inputs)
    scores = outputs[0][0].detach().numpy()
    return scores.argmax() + 1  # Returns a score from 1 (very negative) to 5 (very positive)

# Print column names and shape of the DataFrame
print("Columns in the DataFrame:", merged_df.columns)
print("Shape of the DataFrame:", merged_df.shape)

# Perform sentiment analysis
print("Calculating sentiment scores...")
merged_df['sentiment'] = merged_df['Headline'].apply(get_sentiment_score)
print("Sentiment scores calculated and added to the DataFrame.")

# Print updated column names and shape
print("Updated columns in the DataFrame:", merged_df.columns)
print("Updated shape of the DataFrame:", merged_df.shape)

# Convert date column to datetime
date_column = 'end_date'  # Make sure this matches your actual date column name
merged_df[date_column] = pd.to_datetime(merged_df[date_column])
merged_df.set_index(date_column, inplace=True)

print("Shape after setting index:", merged_df.shape)

# Aggregate sentiment scores and approval ratings (weekly)
weekly_sentiment = merged_df.resample('W')['sentiment'].mean()
weekly_approval = merged_df.resample('W')['yes'].mean()

print("Shape of weekly_sentiment:", weekly_sentiment.shape)
print("Shape of weekly_approval:", weekly_approval.shape)

# Combine sentiment and approval data
data = pd.concat([weekly_sentiment, weekly_approval], axis=1).dropna()

print("Shape of combined data:", data.shape)

# Create lagged features
for i in range(1, 2):  # Creating 4 weeks of lagged features
    data[f'sentiment_lag_{i}'] = data['sentiment'].shift(i)

data = data.dropna()

print("Shape of data after creating lagged features:", data.shape)

# If the dataset is not empty, proceed with modeling
if data.shape[0] > 0:
    # Prepare features and target
    X = data.drop('yes', axis=1)
    y = data['yes']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a Random Forest model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared Score: {r2}")

    # Feature importance
    feature_importance = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
    print(feature_importance.sort_values('importance', ascending=False))
else:
    print("The dataset is empty after processing.▌")

  from .autonotebook import tqdm as notebook_tqdm


Columns in the DataFrame: Index(['end_date', 'Headline', 'yes'], dtype='object')
Shape of the DataFrame: (810, 3)
Calculating sentiment scores...
Sentiment scores calculated and added to the DataFrame.
Updated columns in the DataFrame: Index(['end_date', 'Headline', 'yes', 'sentiment'], dtype='object')
Updated shape of the DataFrame: (810, 4)
Shape after setting index: (810, 3)
Shape of weekly_sentiment: (58,)
Shape of weekly_approval: (58,)
Shape of combined data: (58, 2)
Shape of data after creating lagged features: (57, 3)
Mean Squared Error: 18.54528347086191
R-squared Score: 0.11533356831674868
           feature  importance
0        sentiment     0.50129
1  sentiment_lag_1     0.49871


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Assuming merged_df is your original dataframe
print("Original shape:", merged_df.shape)

# Create a date index (adjust the start date if needed)
merged_df = merged_df.reset_index(drop=True)
merged_df.index = pd.date_range(start='2021-01-01', periods=len(merged_df), freq='D')

# Create lagged features (only 1 day lag for this small dataset)
merged_df['sentiment_lag_1'] = merged_df['sentiment'].shift(1)

# Drop rows with NaN values
data = merged_df.dropna()

print("Shape after creating lag feature:", data.shape)

# Prepare features and target
X = data[['sentiment', 'sentiment_lag_1']]
y = data['yes']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

# Print coefficients
print("\nModel Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"{feature}: {coef}")

print(f"Intercept: {model.intercept_}")

# Print the first few rows of the final dataset
print("\nFirst few rows of the final dataset:")
print(data.head())

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Print column names and shape of the DataFrame
print("Columns in the DataFrame:", merged_df.columns)
print("Shape of the DataFrame:", merged_df.shape)

# Since we don't have a date column, we'll use the index as our date
# Assuming the DataFrame is already sorted by date
merged_df = merged_df.reset_index(drop=True)
merged_df.index = pd.date_range(start='2021-01-01', periods=len(merged_df), freq='D')

print("Shape after setting index:", merged_df.shape)

# Aggregate sentiment scores and approval ratings (weekly)
weekly_sentiment = merged_df.resample('W')['sentiment'].mean()
weekly_approval = merged_df.resample('W')['yes'].mean()

print("Shape of weekly_sentiment:", weekly_sentiment.shape)
print("Shape of weekly_approval:", weekly_approval.shape)

# Combine sentiment and approval data
data = pd.concat([weekly_sentiment, weekly_approval], axis=1).dropna()

print("Shape of combined data:", data.shape)

# Create lagged features
for i in range(1, 5):  # Creating 4 weeks of lagged features
    data[f'sentiment_lag_{i}'] = data['sentiment'].shift(i)

data = data.dropna()

print("Shape of data after creating lagged features:", data.shape)

# If the dataset is not empty, proceed with modeling
if data.shape[0] > 0:
    # Prepare features and target
    X = data.drop('yes', axis=1)
    y = data['yes']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a Random Forest model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared Score: {r2}")

    # Feature importance
    feature_importance = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
    print(feature_importance.sort_values('importance', ascending=False))
else:
    print("The dataset is empty after processing. Please check your data and aggregation steps.")

# Print the first few rows of the final dataset
print("\nFirst few rows of the final dataset:")
print(data.head())

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Print column names and shape of the DataFrame
print("Columns in the DataFrame:", merged_df.columns)
print("Shape of the DataFrame:", merged_df.shape)

# Convert end_date to datetime
merged_df['end_date'] = pd.to_datetime(merged_df['end_date'])
merged_df.set_index('end_date', inplace=True)

# Sort the DataFrame by date
merged_df = merged_df.sort_index()

print("Shape after setting index:", merged_df.shape)

# Create lagged feature (only 1 day lag for this small dataset)
merged_df['sentiment_lag_1'] = merged_df['sentiment'].shift(1)

# Drop rows with NaN values
data = merged_df.dropna()

print("Shape after creating lag feature:", data.shape)

# Prepare features and target
X = data[['sentiment', 'sentiment_lag_1']]
y = data['yes']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

# Print coefficients
print("\nModel Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"{feature}: {coef}")

print(f"Intercept: {model.intercept_}")

# Print the first few rows of the final dataset
print("\nFirst few rows of the final dataset:")
print(data.head())

# Print correlation matrix
print("\nCorrelation Matrix:")
print(data[['yes', 'sentiment', 'sentiment_lag_1']].corr())

Columns in the DataFrame: Index(['end_date', 'Headline', 'yes'], dtype='object')
Shape of the DataFrame: (810, 3)
Shape after setting index: (810, 2)


KeyError: 'sentiment'

In [None]:
# Assuming the correct date column name is 'date_column_name'
date_column_name = 'end_date'  # Replace with the actual column name

# Check if the column exists
if date_column_name in merged_df.columns:
    # Convert the date column to datetime
    merged_df[date_column_name] = pd.to_datetime(merged_df[date_column_name])
    merged_df.set_index(date_column_name, inplace=True)

    # Sort the DataFrame by date
    merged_df = merged_df.sort_index()

    print("Shape after setting index:", merged_df.shape)

    # Continue with the rest of your analysis...
else:
    print(f"Column '{date_column_name}' not found in the DataFrame.")

In [None]:
merged_df.head()