In [None]:
import pandas as pd
from numpy import NaN
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from yahoofinancials import YahooFinancials

In [None]:
# Preprocessing function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [None]:
def preprocess_text(text):
    if isinstance(text, str):
        # remove punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)  # r'[^\w\s]' : matches any character that is not a word character (alphanumeric or underscore) or a whitespace character
        # convert to lowercase
        text = text.lower()
        # tokenize text
        tokens = nltk.word_tokenize(text)
        # remove stop words
        tokens = [token for token in tokens if token not in stop_words]
        # lemmatize text
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        # join tokens back into text
        text = ' '.join(tokens)
    return text


In [None]:
# Load the Excel file into a DataFrame
df = pd.read_csv(r"C:\Users\Lenovo\Desktop\dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment.csv")

# Remove rows with "na" values
df = df.dropna(subset=['full_text'])

# # Fill missing values in 'full_text' column with an empty string
df['full_text'] = df['full_text'].fillna('')
# # to lower text
df['full_text'] = df['full_text'].str.lower()
# # Preprocess the 'full_text' column
df['clean_text'] = df['full_text'].apply(preprocess_text)
# # Filter the DataFrame to keep rows where "created_at" is greater than or equal to 2021-01-01
import datetime
# #Convert the "created_at" column to datetime format
df['created_at'] = pd.to_datetime(df['created_at'])

# ##add importance_coefficient per tweets
df['importance_coefficient'] = df['retweet_count'] + 2 * df['favorite_count'] + 0.5 * df['reply_count']
# # Find the minimum and maximum values of the importance coefficient
min_value = df['importance_coefficient'].min()
max_value = df['importance_coefficient'].max()

# # Normalize the importance coefficient
df['importance_coefficient_normalized'] = (df['importance_coefficient'] - min_value) / (max_value - min_value)
# # Sort the DataFrame based on the "created_at" column in ascending order
df = df.sort_values('created_at', ascending=True)

# Print the sorted DataFrame
df.head()
df.loc[df["created_at"]=='1/1/2023']


## Model 2: Vader Sentiment analysis

In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [None]:
def analyze_sentiment(df):
    # Create a copy of the input DataFrame
    df_selected = df.copy()

    # Initialize the VADER sentiment analyzer
    sid = SentimentIntensityAnalyzer()

    # Calculate sentiment scores and add them to the DataFrame
    df_selected['scores'] = df_selected['full_text'].apply(lambda description: sid.polarity_scores(description))
    df_selected['compound'] = df_selected['scores'].apply(lambda score_dict: score_dict['compound'])
    
    # Create a new column for sentiment_type and classify based on the compound score
    df_selected['sentiment_type'] = df_selected['compound'].apply(lambda avg_compound: 'POSITIVE' if avg_compound > 0 else 'NEUTRAL' if avg_compound == 0 else 'NEGATIVE')

    return df_selected
sentiment_df=analyze_sentiment(df)

In [None]:
import pandas as pd

def crypto_sentiment(df, coin_symbol):
    df = df[df["new_coins"].str.contains(f"({coin_symbol})")].copy()
    df = df[(df["created_at"] >= "2023-01-01") & (df["created_at"] <= "2023-06-12")].sort_values(by="created_at", ascending=True)
    sentiment_df = df.groupby(df['created_at'].dt.date)['compound'].mean().reset_index()

    def classify_sentiment(avg_compound):
        if avg_compound > 0:
            return 'Positive'
        elif avg_compound < 0:
            return 'Negative'
        else:
            return 'Neutral'

    sentiment_df['sentiment_type'] = sentiment_df['compound'].apply(classify_sentiment)

    return sentiment_df



btc_sentiment=crypto_sentiment_result = crypto_sentiment(sentiment_df, "(btc)")
eth_sentiment=crypto_sentiment(sentiment_df, "(eth)")
bnb_sentiment=crypto_sentiment(sentiment_df, "(bnb)")

In [None]:
def get_historical_prices(tickers, start_date, end_date):
    data = {}
    for ticker in tickers:
        yahoo_financials = YahooFinancials(ticker)
        historical_data = yahoo_financials.get_historical_price_data(start_date, end_date, "daily")
        data[ticker] = historical_data[ticker]['prices']
    dfs = []
    for ticker, prices in data.items():
        df = pd.DataFrame(prices)
        df = df.drop('date', axis=1).set_index('formatted_date')
        df.columns = [f"{ticker}_close", f"{ticker}_high", f"{ticker}_low", f"{ticker}_open", f"{ticker}_volume", f"{ticker}_adjclose"]
        df['formatted_date'] = pd.to_datetime(df.index) # Add formatted_date column
        dfs.append(df)
    merged_df = pd.concat(dfs, axis=1)
    return merged_df

In [None]:
tickers = ['BTC-USD', 'ETH-USD', 'BNB-USD','XMR-USD','MATIC-USD','XRP-USD','DAI-USD','DOT-USD']
start_date = '2021-2-1'
end_date = '2023-06-12'

btc_df = get_historical_prices(tickers, start_date, end_date)
# Move formatted_date column to the first position
btc_df = btc_df[["formatted_date"] + [col for col in btc_df.columns if col != "formatted_date"]]
# Delete duplicate formatted_date columns
btc_df = btc_df.loc[:, ~btc_df.columns.duplicated()]
print(btc_df.head())

In [None]:
import matplotlib.pyplot as plt

# Prepare the data
df_draw = btc_df[['formatted_date', 'XRP-USD_volume', 'ETH-USD_volume', 'BTC-USD_volume','XMR-USD_volume'
,'DAI-USD_volume','DOT-USD_volume']]
df_draw['formatted_date'] = pd.to_datetime(df_draw['formatted_date'], format='%Y-%m-%d %I-%p')
df_draw.set_index('formatted_date', inplace=True)

# Create the plot
plt.figure(figsize=(12, 6))  # Set the figure size to 12 inches wide and 6 inches high
plt.plot(df_draw.index, df_draw['XRP-USD_volume'], label='Ripple')
plt.plot(df_draw.index, df_draw['ETH-USD_volume'], label='ETH')
plt.plot(df_draw.index, df_draw['BTC-USD_volume'], label='BTC')
plt.plot(df_draw.index, df_draw['XMR-USD_volume'], label='Monero')
plt.plot(df_draw.index, df_draw['DOT-USD_volume'], label='Polkadot')
#plt.plot(df_draw.index, df_draw['DAI-USD_volume'], label='Dai')
plt.xlabel('Date')
plt.ylabel('Volume')
plt.title('Cryptocurrency Volume Over Time')
plt.legend()



plt.show()

In [None]:
btc_selected = btc_df.iloc[:, :7]
btc_selected = btc_selected.round(0)
btc_selected['formatted_date'] = btc_selected.index

btc_selected.head()
#---ETH-selected------------
eth_selected = btc_df.iloc[:, 7:13]
eth_selected = eth_selected.round(0)
eth_selected['formatted_date'] = eth_selected.index

eth_selected
#--BNB-selected------------
bnb_selected = btc_df.iloc[:, 13:19]
bnb_selected = bnb_selected.round(0)
bnb_selected['formatted_date'] = bnb_selected.index

In [None]:
# Calculate price changes
btc_selected['price_changes'] = btc_selected['BTC-USD_close'].diff()
btc_selected['price_changes'] = btc_selected['price_changes'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

# Calculate price changes
eth_selected['price_changes'] = eth_selected['ETH-USD_close'].diff()
eth_selected['price_changes'] = eth_selected['price_changes'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')
eth_selected
# Calculate price changes
bnb_selected['price_changes'] = bnb_selected['BNB-USD_close'].diff()
bnb_selected['price_changes'] = bnb_selected['price_changes'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')
bnb_selected

In [None]:
#change columns name
def remove_chars_before_underscore(df):
    df.columns = df.columns.str.split('_').str[-1]
remove_chars_before_underscore(btc_selected)
remove_chars_before_underscore(eth_selected)
remove_chars_before_underscore(bnb_selected)
btc_selected
eth_selected
bnb_selected

In [None]:
## add sentimeni type and compund to dataframe
bnb_selected['date'] = pd.to_datetime(bnb_selected['date'])
bnb_sentiment['created_at'] = pd.to_datetime(bnb_sentiment['created_at'])

# Perform left merge on 'date' and 'created_at' columns
bnb_selected = pd.merge(bnb_selected, bnb_sentiment[['created_at', 'compound', 'sentiment_type']],
                     left_on='date', right_on='created_at', how='left')

# Drop the redundant 'created_at' column
bnb_selected = bnb_selected.drop('created_at', axis=1)



# add sentimeni type and compund to dataframe
eth_selected['date'] = pd.to_datetime(eth_selected['date'])
eth_sentiment['created_at'] = pd.to_datetime(eth_sentiment['created_at'])

# Perform left merge on 'date' and 'created_at' columns
eth_selected = pd.merge(eth_selected, eth_sentiment[['created_at', 'compound', 'sentiment_type']],
                     left_on='date', right_on='created_at', how='left')

# Drop the redundant 'created_at' column
eth_selected = eth_selected.drop('created_at', axis=1)

eth_selected

## add sentimeni type and compund to dataframe
btc_selected['date'] = pd.to_datetime(btc_selected['date'])
btc_sentiment['created_at'] = pd.to_datetime(btc_sentiment['created_at'])

# Perform left merge on 'date' and 'created_at' columns
btc_selected = pd.merge(btc_selected, btc_sentiment[['created_at', 'compound', 'sentiment_type']],
                     left_on='date', right_on='created_at', how='left')

# Drop the redundant 'created_at' column
btc_selected = btc_selected.drop('created_at', axis=1)

btc_selected

In [None]:
# Drop the top row
bnb_selected = bnb_selected.iloc[1:]
# Output the merged dataframe
bnb_selected



In [20]:
eth_selected.to_csv(r"C:\Users\Lenovo\Desktop\Saturn project\sentiment\bnb_selected_with_sentiment_2023_01_02_2023_06_12.csv")
btc_selected.to_csv(r"C:\Users\Lenovo\Desktop\Saturn project\sentiment\btc_selected_with_sentiment_2023_01_02_2023_06_12.csv")
bnb_selected.to_csv(r"C:\Users\Lenovo\Desktop\Saturn project\sentiment\bnb_selected_with_sentiment_2023_01_02_2023_06_12.csv")

In [23]:
print(eth_selected)

      close    high     low    open       volume  adjclose       date  \
0    1203.0  1193.0  1197.0  1201.0   2399674550    1201.0 2023-01-01   
1    1220.0  1195.0  1201.0  1215.0   3765758498    1215.0 2023-01-02   
2    1219.0  1207.0  1215.0  1215.0   3392972131    1215.0 2023-01-03   
3    1265.0  1213.0  1215.0  1257.0   6404416893    1257.0 2023-01-04   
4    1259.0  1245.0  1256.0  1250.0   4001786456    1250.0 2023-01-05   
..      ...     ...     ...     ...          ...       ...        ...   
158  1861.0  1830.0  1833.0  1846.0   4536041931    1846.0 2023-06-08   
159  1855.0  1829.0  1846.0  1840.0   4610831509    1840.0 2023-06-09   
160  1845.0  1721.0  1840.0  1752.0  10788500406    1752.0 2023-06-10   
161  1777.0  1741.0  1753.0  1753.0   4559112981    1753.0 2023-06-11   
162  1758.0  1723.0  1753.0  1743.0   6031384958    1743.0 2023-06-12   

      changes  compound sentiment_type  
0     neutral  0.199350       Positive  
1    positive  0.214500       Positive  
