In [38]:
import pandas as pd

# Load the raw inputs: the approval polls and the news headlines.
poll = pd.read_csv('president_approval.csv')
headlines = pd.read_csv('headlines.csv')

# Keep only the poll end date and the approval figure. Selecting the columns
# by name (instead of "the first two columns") stays correct if the CSV column
# order changes, and the explicit copy avoids pandas' SettingWithCopyWarning
# when the date column is reassigned below.
poll = poll[['end_date', 'yes']].copy()

# Parse both join keys as datetimes so they compare equal in the merge.
poll['end_date'] = pd.to_datetime(poll['end_date'])
headlines['Date'] = pd.to_datetime(headlines['Date'])

# Left join: every poll row is paired with each headline whose Date equals
# the poll's end_date.
merged_df = pd.merge(poll, headlines, left_on='end_date', right_on='Date', how='left')

# Poll rows without a matching headline have a null 'Headline'; drop them.
merged_df = merged_df.dropna(subset=['Headline'])

# 'Date' duplicates 'end_date' on every remaining row, so keep only one of them.
merged_df = merged_df.drop(columns='Date')

# NOTE(review): the recorded output of this cell shows exact duplicate rows
# (same end_date, yes and Headline). Presumably one of the input files has
# repeated entries -- TODO confirm, and consider drop_duplicates() before
# modelling so identical samples cannot land in both train and test sets.

# Display the first few rows to verify
print(merged_df.head())

# Save the merged table for later use
merged_df.to_csv('merged_poll_headlines.csv', index=False)

    end_date    yes                                           Headline  \
0 2024-12-04  38.00  Detroit’s Mayor, a Democrat, Will Run for Mich...   
1 2024-12-04  38.00  Trump Has ‘Lost Faith’ in N.R.A., Says Gun Gro...   
2 2024-12-04  38.00  Detroit’s Mayor, a Democrat, Will Run for Mich...   
3 2024-12-04  38.00  Trump Has ‘Lost Faith’ in N.R.A., Says Gun Gro...   
4 2024-12-02  37.11    Biden the Father vs. Biden the Institutionalist   

  Published Date  
0     2024-12-04  
1     2024-12-04  
2     2024-12-04  
3     2024-12-04  
4     2024-12-02  


In [45]:
# The headline text is the corpus whose sentiment gets scored.
corpus = merged_df.loc[:, 'Headline']
corpus.head()

0    Detroit’s Mayor, a Democrat, Will Run for Mich...
1    Trump Has ‘Lost Faith’ in N.R.A., Says Gun Gro...
2    Detroit’s Mayor, a Democrat, Will Run for Mich...
3    Trump Has ‘Lost Faith’ in N.R.A., Says Gun Gro...
4      Biden the Father vs. Biden the Institutionalist
Name: Headline, dtype: object

In [47]:
# import nltk vader library
from nltk.sentiment.vader import SentimentIntensityAnalyzer
    
# Build the VADER analyzer once and reuse it for every headline.
sia = SentimentIntensityAnalyzer()

# One polarity dictionary per headline, in the same order as `corpus`.
scores = [sia.polarity_scores(sentence) for sentence in corpus]

# Split the dictionaries into one list per score type
# (positive, negative, neutral, compound).
senti_pos = [ss['pos'] for ss in scores]
senti_neg = [ss['neg'] for ss in scores]
senti_neu = [ss['neu'] for ss in scores]
senti_comp = [ss['compound'] for ss in scores]

In [50]:
# Attach the four sentiment score lists as new columns in a single assign call.
merged_df = merged_df.assign(
    pos=senti_pos,
    neg=senti_neg,
    neu=senti_neu,
    compound=senti_comp,
)

In [51]:
# Feature matrix: the four sentiment score columns.
feature_cols = ['pos', 'neg', 'neu', 'compound']
X = merged_df[feature_cols]

In [52]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,pos,neg,neu,compound
0,0.0,0.0,1.0,0.0
1,0.0,0.211,0.789,-0.34
2,0.0,0.0,1.0,0.0
3,0.0,0.211,0.789,-0.34
4,0.0,0.0,1.0,0.0


In [53]:
# (rows, columns) of the feature matrix.
X.shape

(40, 4)

In [56]:
# Target: the 'yes' column, kept as a one-column DataFrame.
target_cols = ['yes']
y = merged_df[target_cols]

y.head()

Unnamed: 0,yes
0,38.0
1,38.0
2,38.0
3,38.0
4,37.11


In [57]:
# load the required library
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=50,
)

In [None]:
# import the library
# The target 'yes' holds continuous values (e.g. 37.11), so this is a
# regression problem: DecisionTreeClassifier.fit rejects continuous targets
# with "Unknown label type", which is why a regressor is used here.
from sklearn.tree import DecisionTreeRegressor

# initialize the algorithm (fixed seed so the fitted tree is reproducible)
dtree = DecisionTreeRegressor(random_state=50)

# Generate a new model using training data only
dtree.fit(X_train, y_train)

In [58]:
# load the required library
from sklearn.model_selection import train_test_split

# 70/30 train/test split with a fixed seed.
split = train_test_split(X, y, test_size=0.3, random_state=50)
X_train, X_test, y_train, y_test = split

In [67]:
# import the library
from sklearn.tree import DecisionTreeRegressor

# Tree settings: cap the depth and require minimum sample counts per split
# and per leaf; the seed fixes any randomness in tree construction.
tree_params = {
    'max_depth': 5,
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'random_state': 50,
}
dtree = DecisionTreeRegressor(**tree_params)

# Fit on the training split only.
dtree.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=5, min_samples_leaf=10, min_samples_split=20,
                      random_state=50)

In [68]:
# load the required libraries
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  

# Predict the target for the test-split features with the fitted tree.
y_pred = dtree.predict(X_test)

In [69]:
# Import the correct library for regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Refit the tree on the training split, then predict the test split.
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)

# Regression metrics on the test split: MSE, its square root, and R-squared.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Emit the three metrics, one per line.
report_lines = (
    f"Mean Squared Error: {mse}",
    f"Root Mean Squared Error: {rmse}",
    f"R-squared Score: {r2}",
)
print("\n".join(report_lines))

Mean Squared Error: 6.52700638286215
Root Mean Squared Error: 2.554800654231588
R-squared Score: -0.24488598560549635
