<a href="https://colab.research.google.com/github/Fizryfu/ML_HA/blob/main/Captsone_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import re
!pip install emoji
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
import torch
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings('ignore')

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/590.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


# Project


In [3]:
!pip install -q kaggle
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"fizryfuu","key":"b92c79d4491d0eadd6cba734c771c083"}'}

In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle config view

Configuration values from /root/.kaggle
- username: fizryfuu
- path: None
- proxy: None
- competition: None


In [6]:
# Search for Bitcoin datasets on Kaggle
!kaggle datasets list -s "bitcoin tweets"

ref                                                              title                                                     size  lastUpdated                 downloadCount  voteCount  usabilityRating  
---------------------------------------------------------------  --------------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
kaushiksuresh147/bitcoin-tweets                                  Bitcoin Tweets                                       728456658  2023-03-10 15:55:30.037000          13023        196  1.0              
alaix14/bitcoin-tweets-20160101-to-20190329                      Bitcoin tweets - 16M tweets                         1700101813  2019-11-23 16:39:11.020000           5720        101  0.9705882        
paul92s/bitcoin-tweets-14m                                       Bitcoin Tweets 1.4M                                  107760855  2018-08-04 15:47:51.353000           1252         24  0.8235294    

In [7]:
# Download the specific dataset (replace with your actual dataset)
!kaggle datasets download -d kaushiksuresh147/bitcoin-tweets

Dataset URL: https://www.kaggle.com/datasets/kaushiksuresh147/bitcoin-tweets
License(s): CC0-1.0


In [8]:
# Unzip the dataset (adjust filename if needed)
!unzip bitcoin-tweets.zip -d bitcoin_data

Archive:  bitcoin-tweets.zip
  inflating: bitcoin_data/Bitcoin_tweets.csv  
  inflating: bitcoin_data/Bitcoin_tweets_dataset_2.csv  


# Dataset Cleaning


In [9]:
import re
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq
import emoji
from bs4 import BeautifulSoup
import unicodedata

In [10]:
def load_data_in_chunks(file_path, chunksize=10000, output_parquet="cleaned_data.parquet"):
    """
    Robust CSV reader with error handling for malformed files
    """
    first_chunk = True

    # Use Python's CSV reader with error handling
    try:
        # Try reading with standard parameters first
        chunk_iter = pd.read_csv(
            file_path,
            chunksize=chunksize,
            engine='python',
            on_bad_lines='warn',
            quoting=3  # QUOTE_NONE
        )

        for chunk in tqdm(chunk_iter, desc="Processing chunks"):
            try:
                # Skip empty chunks
                if len(chunk) == 0:
                    continue

                cleaned_chunk = preprocess_tweets(chunk)
                table = pa.Table.from_pandas(cleaned_chunk)

                if first_chunk:
                    pq.write_table(table, output_parquet)
                    first_chunk = False
                else:
                    with pq.ParquetWriter(output_parquet, table.schema) as writer:
                        writer.write_table(table)

            except Exception as e:
                print(f"Error processing chunk: {e}")
                continue

    except Exception as e:
        print(f"Fatal error reading file: {e}")
        return False

    print(f"Successfully processed and saved to {output_parquet}")
    return True

def preprocess_tweets(df):
    """
    Enhanced preprocessing with robust error handling
    """
    # Safely select columns
    available_cols = [col for col in ['text', 'hashtags', 'user_followers', 'date', 'is_retweet']
                     if col in df.columns]
    df = df[available_cols].copy()

    # Initialize missing columns
    for col in ['text', 'hashtags', 'is_retweet']:
        if col not in df.columns:
            df[col] = "" if col == 'text' else [] if col == 'hashtags' else False

    # Convert is_retweet safely
    df['is_retweet'] = pd.to_numeric(df['is_retweet'], errors='coerce').fillna(0).astype(bool)

    # Text processing with error handling
    df['text'] = df['text'].astype(str).apply(
        lambda x: BeautifulSoup(x, 'html.parser').get_text() if pd.notna(x) else ""
    )

    df['text'] = df['text'].apply(
        lambda x: unicodedata.normalize('NFKD', x) if pd.notna(x) else ""
    )

    df['text'] = df['text'].apply(process_emojis)

    df['text'] = df['text'].apply(
        lambda x: re.sub(r'http\S+|@\w+|[^\w\s#\U0001F300-\U0001F6FF]', '', x)
    )

    # Hashtags processing
    df['hashtags'] = df['hashtags'].apply(
        lambda x: safe_eval_hashtags(x) if pd.notna(x) else []
    )

    # Filter retweets if column exists
    if 'is_retweet' in df.columns:
        df = df[~df['is_retweet']].drop(columns=['is_retweet'], errors='ignore')

    return df

def safe_eval_hashtags(x):
    """Safely evaluate hashtag strings"""
    try:
        if isinstance(x, str) and x.startswith('['):
            return eval(x)
        return []
    except:
        return []

def process_emojis(text):
    """Robust emoji processing"""
    if not isinstance(text, str):
        return ""

    emoticon_map = {
        r':\)|:-\)|\(-:|\(:' : '[emoticon:happy]',
        r':\(|:-\(|\)-:|\):' : '[emoticon:sad]',
        r';\)|;-\)': '[emoticon:wink]',
        r':D|:-D': '[emoticon:laugh]',
    }

    for pattern, label in emoticon_map.items():
        text = re.sub(pattern, label, text)

    try:
        return emoji.demojize(text, delimiters=("[emoji:", "]"))
    except:
        return text

# Usage:
success = load_data_in_chunks(
    "Bitcoin_tweets.csv",
    chunksize=10000,
    output_parquet="bitcoin_cleaned.parquet"
)

Fatal error reading file: [Errno 2] No such file or directory: 'Bitcoin_tweets.csv'


In [12]:
load_data_in_chunks("/content/bitcoin_data/Bitcoin_tweets.csv", chunksize=10000, output_parquet="bitcoin_cleaned.parquet")

Processing chunks: 2241it [22:18,  1.67it/s]

Successfully processed and saved to bitcoin_cleaned.parquet





True

In [14]:
df = pd.read_parquet("bitcoin_cleaned.parquet")

print("Basic Stats:")
print(df.describe())

Basic Stats:
        text hashtags user_followers   date
count   2650     2650            109     25
unique    14     2650             51     17
top     None       []      'altcoin'  False
freq    2633        1             30      8
