# Attempt to identify a clean sample of Tweets and labels

Poor data quality has been identified as a major obstacle to validating a model that classifies Tweets as "raining" or "not raining". Therefore, we would like to identify a sample of Tweets that
- provide sufficient information to be classified as "raining" / "not raining" (by human/machine/...)
- are labeled as accurately as possible


Results:

- Use tweets that are near weather stations and use weather station measurements as labels
- Optionally, use only Tweets that have a high prediction probability for their respective class (a somewhat circular "cleaning" procedure, which will introduce a bias when evaluating models)

In [None]:
# allows update of external libraries without need to reload package
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import plotly.express
import matplotlib.pyplot as plt
import sklearn.manifold
import gc
import sys
import guppy
import tqdm
import memory_profiler
import torch
import openTSNE
import xarray
import plotly

import a2.training.training_hugging
import a2.training.evaluate_hugging
import a2.training.dataset_hugging
import a2.plotting.analysis
import a2.plotting.histograms
import a2.dataset
import a2.utils

In [None]:
# Model locations: a local checkpoint directory and a model identifier string.
# NOTE(review): neither constant is referenced by the cells visible in this notebook.
FOLDER_MODEL_PRETRAINED = "../../models/model_weights/output_rainprediction_simpledeberta/era5/checkpoint-7617/"
FOLDER_MODEL = "microsoft/deberta-v3-small"
# Earlier, smaller input file kept for reference:
# FILE_TWEETS = "../../../maelstrom_bootcamp/Applications/AP2/bootcamp2022_data/tweets/tweets_2017_01_era5_normed_filtered.nc"
# NOTE(review): the folders below are absolute, user-specific paths; adjust them before running elsewhere.
FOLDER_TWEETS = "/home/kristian/Projects/a2/data/tweets/"
FILE_TWEETS = (
    FOLDER_TWEETS
    + "2017_2020_tweets_rain_sun_vocab_emojis_locations_bba_Tp_era5_no_bots_normalized_filtered_weather_stations_fix_predicted_simpledeberta_radar.nc"
)
FOLDER_EMBEDDINGS = "/home/kristian/Projects/a2/data/embeddings/cls_token/"
FILE_EMBEDDINGS = FOLDER_EMBEDDINGS + "cls_tokenstweets_2017_era5_normed_filtered.nc.npy"
# List the tweets file via the shell as a quick check that the path exists.
!ls $FILE_TWEETS

In [None]:
# Load the tweets dataset and keep only entries whose `created_at` year is not 2020.
ds_raw = a2.dataset.load_dataset.load_tweets_dataset(FILE_TWEETS)
ds = ds_raw.where(ds_raw["created_at"].dt.year != 2020, drop=True)
# Integer label along "index": 1 where `tp_h` exceeds 1e-8, otherwise 0.
# NOTE(review): a NaN `tp_h` compares False and is therefore labelled 0 — confirm this is intended.
ds["raining"] = (["index"], np.array(ds.tp_h.values > 1e-8, dtype=int))

In [None]:
# Compare the `raining` labels with the `prediction_probability_raining` values stored in the dataset.
a2.plotting.analysis.plot_prediction_certainty(ds["raining"].values, ds["prediction_probability_raining"].values)

In [None]:
# Histogram of the stored prediction probabilities using 100 bins.
ds["prediction_probability_raining"].plot.hist(bins=100)

In [None]:
# Confidently predicted "not raining" tweets: label 0 and probability below 0.3.
ds_NR_TN = ds.where(
    (ds["raining"] == 0) & (ds["prediction_probability_raining"] < 0.3),
    drop=True,
)
# Confidently predicted "raining" tweets: label 1 and probability above 0.8.
ds_R_TP = ds.where(
    (ds["raining"] == 1) & (ds["prediction_probability_raining"] > 0.8),
    drop=True,
)

In [None]:
# Tweets the model is undecided about: probability strictly between 0.4 and 0.6.
ds_undecided = ds.where(
    (ds["prediction_probability_raining"] < 0.6) & (ds["prediction_probability_raining"] > 0.4),
    drop=True,
)

In [None]:
# Boolean station-based label along "index": True where `station_tp_mm` is greater than 0.
# NOTE(review): a NaN `station_tp_mm` compares False, so entries without a station value are
# labelled False here — confirm this is acceptable wherever the full `ds` is analysed.
ds["raining_station"] = (["index"], ds["station_tp_mm"].values > 0)

In [None]:
# pandas view of the dataset; passed to the DataFrame-based plotting calls below.
df = ds.to_dataframe()

In [None]:
# 2D histogram of station distance (x, limited to 0-3) against predicted probability (y, 0-1),
# faceted by `raining_station`. The same spacing value is used for both axes.
# NOTE(review): `filename` presumably makes the helper write the figure to /tmp/test.pdf — confirm.
spacing = 0.12
a2.plotting.histograms.plot_histogram_2d(
    x="station_distance_km",
    y="prediction_probability_raining",
    ds=df,
    # facet_column='raining',
    facet_row="raining_station",
    n_bins=[40, 40],
    xlim=[0, 3],
    ylim=[0, 1],
    spacing_x=spacing,
    spacing_y=spacing,
    font_size=10,
    filename="/tmp/test.pdf",
)

In [None]:
# Same two variables as a plotly density heatmap, faceted by `raining` (columns)
# and `raining_station` (rows), with finer binning (400 x 100).
plotly.express.density_heatmap(
    df,
    x="station_distance_km",
    y="prediction_probability_raining",
    facet_col="raining",
    facet_row="raining_station",
    nbinsx=400,
    nbinsy=100,
    range_x=[0, 3],
    range_y=[0, 1],
)

In [None]:
# Subset used as the "clean" sample: `station_distance_km` below 1 and `station_tp_mm`
# not flagged by `is_nan` (presumably: a station measurement exists — confirm helper semantics).
ds_near_station = ds.where(
    (ds["station_distance_km"] < 1) & (~a2.dataset.utils_dataset.is_nan(ds, "station_tp_mm")), drop=True
)
# Shape of the `index` coordinate, i.e. how many entries remain in the subset.
ds_near_station.index.shape

In [None]:
# Display the subset.
ds_near_station

In [None]:
# NOTE(review): duplicate of the previous cell.
ds_near_station

In [None]:
# 2D histogram of the station measurement (x, limited to 0-1) against the boolean station label (y).
a2.plotting.histograms.plot_histogram_2d(
    ds=ds_near_station, y="raining_station", x="station_tp_mm", xlim=[0, 1], norm="log"
)

In [None]:
# Save the subset next to the other tweet files (path is relative to the notebook's working directory).
a2.dataset.load_dataset.save_dataset(ds_near_station, "../../data/tweets/2017_2020_tweets_keywords_near_station.nc")

In [None]:
# Raw and normalized text of the entries whose station measurement equals exactly 1.
ds_near_station.where(ds_near_station["station_tp_mm"] == 1, drop=True)[["text", "text_normalized"]]

In [None]:
# Display the station measurements of the subset.
ds_near_station["station_tp_mm"]

In [None]:
# Report comparing the station label (first argument) with the `raining` label (second argument).
# NOTE(review): argument order is presumably (truth, prediction) as in scikit-learn — confirm.
# NOTE(review): the second argument is passed as a DataArray here, but as `.values` in the next cell.
result = a2.plotting.analysis.classification_report(
    ds_near_station["raining_station"].values, ds_near_station["raining"], output_dict=False
)
print(result)

In [None]:
# Same report, but comparing the station label with the predicted probability thresholded at 0.8.
result = a2.plotting.analysis.classification_report(
    ds_near_station["raining_station"].values,
    ds_near_station["prediction_probability_raining"].values > 0.8,
    output_dict=False,
)
print(result)

In [None]:
# Histogram of the `full_name` values of the subset, restricted to entries not flagged by `is_na`.
# NOTE(review): this cell calls `is_na`, while the subset selection calls `is_nan` — confirm both helpers exist.
a2.plotting.histograms.plot_histogram(
    ds_near_station.where(~a2.dataset.utils_dataset.is_na(ds_near_station, "full_name"), drop=True)["full_name"].values,
    log=[False, False],
    min_counts=40,
)

In [None]:
# Histogram of the `bounding_box_area` values of the subset.
a2.plotting.histograms.plot_histogram(ds_near_station["bounding_box_area"].values)

In [None]:
# Texts (items 100-139) from the full dataset where the station label is 1, the `raining`
# label is 0 and the predicted probability exceeds 0.8.
ds.where(
    (ds["raining_station"] == 1) & (ds["raining"] == 0) & (ds["prediction_probability_raining"] > 0.8), drop=True
).text.values[100:140]

In [None]:
# First 100 normalized texts of the near-station subset.
ds_near_station.text_normalized.values[:100]

In [None]:
# Normalized texts (items 100-139): station label 1 and predicted probability above 0.8.
ds_near_station.where(
    (ds_near_station["raining_station"] == 1) & (ds_near_station["prediction_probability_raining"] > 0.8), drop=True
).text_normalized.values[100:140]

In [None]:
# Normalized texts (items 100-139): station label 0 and predicted probability below 0.2.
ds_near_station.where(
    (ds_near_station["raining_station"] == 0) & (ds_near_station["prediction_probability_raining"] < 0.2), drop=True
).text_normalized.values[100:140]

In [None]:
# Compare the station label with the predicted probabilities for the near-station subset.
a2.plotting.analysis.plot_prediction_certainty(
    ds_near_station["raining_station"].values, ds_near_station["prediction_probability_raining"].values
)

In [None]:
# Station distance against the difference between the two labels (station label minus `raining`),
# computed on the full dataset.
a2.plotting.histograms.plot_histogram_2d(
    ds["station_distance_km"].values, ds["raining_station"].values - ds["raining"].values
)

In [None]:
# List the tweets folder via the shell.
!ls $FOLDER_TWEETS

In [None]:
# Display the path of the input file.
FILE_TWEETS

In [None]:
# Function to download an instagram photo or video
import re
import requests
from datetime import datetime
from tqdm import tqdm


# Seconds to wait on each HTTP request before giving up instead of hanging the notebook.
_REQUEST_TIMEOUT = 30


def _find_meta_content(src, attribute, value):
    """Return the ``content`` of the first ``meta`` tag with ``attribute="value"`` in ``src``, else ``None``."""
    found = re.search(rf'meta {attribute}="{value}" content=[\'"]?([^\'" >]+)', src)
    return found.group(1) if found else None


def _stream_to_file(link, extension):
    """Stream ``link`` into ``<timestamp><extension>`` in the working directory and return the file name."""
    filename = datetime.strftime(datetime.now(), "%Y-%m-%d-%H-%M-%S")
    target = filename + extension
    # A single streamed request: the response is closed by the context manager even on errors.
    with requests.get(link, stream=True, timeout=_REQUEST_TIMEOUT) as response:
        # Fail loudly rather than writing an HTTP error page into the media file.
        response.raise_for_status()
        # Content-Length is optional; without it the progress bar simply has no total.
        content_length = response.headers.get("Content-Length")
        total = int(content_length) if content_length is not None else None
        with open(target, "wb") as f, tqdm(
            total=total, unit="B", unit_scale=True, desc=filename, ascii=True
        ) as progress:
            for data in response.iter_content(1024):
                f.write(data)
                progress.update(len(data))
    return target


def download_image_video(url, download=False):
    """Fetch an Instagram post page and optionally download the photo or video it references.

    Parameters
    ----------
    url : str
        Address of the post. It must be an ``https://www.[<subdomain>.]instagram.com`` URL.
    download : bool, optional
        ``False`` (default): only request the page and return the response object.
        ``True``: additionally read the ``medium`` meta tag of the page and save the
        image (``.jpg``) or, after an interactive confirmation, the video (``.mp4``)
        into the working directory under a timestamp-based file name.

    Returns
    -------
    ``None`` if ``url`` is not an Instagram URL (a message is printed), the ``requests``
    response if ``download`` is false, otherwise the decoded page source as ``str``.

    Raises
    ------
    requests.RequestException
        If a request fails, times out, or (for the media file) returns an HTTP error status.
    """
    # Dots are escaped and the host must end right after "instagram.com", so look-alike
    # hosts such as "www.instagram.com.evil.example" are rejected.
    if not re.match(r"^https://www\.([^/]+\.)*instagram\.com(?:[/?#:]|$)", url):
        print("Entered URL is not an instagram.com URL.")
        return None

    request_image = requests.get(url, timeout=_REQUEST_TIMEOUT)
    if not download:
        return request_image

    src = request_image.content.decode("utf-8")
    medium = _find_meta_content(src, "name", "medium")
    if medium is None:
        # The page does not declare what kind of media it contains.
        print("Unknown URL")
        return src

    if medium == "image":
        print("\nDownloading the image...")
        image_link = _find_meta_content(src, "property", "og:image")
        if image_link is None:
            print("Unknown URL")
            return src
        _stream_to_file(image_link, ".jpg")
        print("Image downloaded successfully")

    if medium == "video":
        # Normalize the answer (not the prompt) so "Yes", "YES" and " yes " are all accepted.
        msg = input("You are trying to download a video. Do you want to continue? (Yes or No): ").strip().lower()
        if msg == "yes":
            print("Downloading the video...")
            video_link = _find_meta_content(src, "property", "og:video")
            if video_link is None:
                print("Unknown URL")
                return src
            _stream_to_file(video_link, ".mp4")
            print("Video downloaded successfully")
        # Any other answer skips the download; the interpreter/kernel is left running.

    return src

In [None]:
# For an Instagram URL the function hands back the `requests` response of the page request,
# so `src` is a response object in the cells below.
src = download_image_video("https://www.instagram.com/p/BVAiK6OFwpK/")

In [None]:
# Headers of the underlying raw response.
src.raw.headers

In [None]:
# List the attributes available on the raw response object.
dir(src.raw)

In [None]:
# `src` is the `requests` response returned by `download_image_video`, so search its decoded
# body: `re.findall` raises TypeError when it is handed the response object itself.
# re.findall(r'meta property="og:image" content=[\'"]?([^\'" >]+)', src.text)
re.findall(r"object-fit", src.text)

In [None]:
# Bare string literal: running the cell only displays this CDN image URL.
"https://scontent-ham3-1.cdninstagram.com/v/t51.2885-15/18809724_1881045368817923_3546898024418508800_n.jpg?stp=dst-jpg_e35&_nc_ht=scontent-ham3-1.cdninstagram.com&_nc_cat=111&_nc_ohc=WyPCirPVV9EAX86Rbc1&edm=AP_V10EBAAAA&ccb=7-5&oh=00_AfDG28DzonjDkkXHYM68Ehig_t7N77RWLI_HmCBhb8kycg&oe=6402FE9F&_nc_sid=4f375e"

In [None]:
# List the notebook's working directory via the shell.
!ls

In [None]:
# Read an image from the working directory.
# NOTE(review): hard-coded name in the timestamp format used by `download_image_video`;
# presumably produced by an earlier run — the cell fails if the file is absent.
img = plt.imread("2023-02-28-16-20-19.jpg")