In [1]:
import pandas as pd
import numpy as np
import os
import json
import seaborn as sns
import chart_studio.plotly as py
import plotly.graph_objects as go
import geopandas as gpd


# from matplotlib import pyplot as plt
import matplotlib.pyplot as plt

import warnings
# Ignore these four warning categories for the rest of the session.
for _ignored_category in (RuntimeWarning, FutureWarning, UserWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=_ignored_category)

# pandas display: no limit on columns, rows or cell width, and floats are
# rendered with two decimal places.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", lambda value: '%.2f' % value)

from IPython.display import display, HTML
# Inject a CSS rule that forces elements with class `container` to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Folder that holds the input files.
DATA_FOLDER = "data"

# Names of the three input files.
FILE_CSV = "csv_file.csv"
FILE_TXT = "txt_file.txt"
JSON_FILE = "json_file.json"

In [3]:
# Base directory for the data paths: the kernel's current working directory.
# The bare name on the last line echoes it as the cell output.
main_path = os.getcwd()
main_path

In [4]:
# Full path of each input file: <working dir>/<DATA_FOLDER>/<file name>.
file_csvpath = os.path.join(main_path, DATA_FOLDER, FILE_CSV)
file_txtpath = os.path.join(main_path, DATA_FOLDER, FILE_TXT)
file_jsonpath = os.path.join(main_path, DATA_FOLDER, JSON_FILE)

In [5]:
%%time

# Both inputs are parsed with pandas' default read_csv settings
# (comma-separated, first row used as the header) - including the .txt file.
csv_file = pd.read_csv(file_csvpath)
txt_file = pd.read_csv(file_txtpath)


In [6]:
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which is not UTF-8 everywhere).
with open(file_jsonpath, encoding="utf-8") as f:
    json_file = json.load(f)

In [7]:
def load_json(file_path):
    """Read a JSON file and return the decoded Python object.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the JSON file.

    Returns
    -------
    object or None
        The decoded content (for example a list or a dict), or ``None`` when
        the file could not be opened or parsed.

    Notes
    -----
    This is a best-effort loader: errors are printed instead of raised, so
    callers must check the result for ``None``.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(str(e))
        return None

In [8]:
# Preview only the first few records; echoing the whole list floods the output.
json_file[:5]

In [9]:
# Check which Python container the decoded JSON is.
type(json_file)

In [10]:
# Build a DataFrame directly from the decoded records.
# NOTE(review): nested objects such as "author" presumably stay as dict values
# inside a single column here - confirm with the preview below.
some_data = pd.DataFrame(json_file)

In [11]:
# Print the "author" entry of the first five records.
for record in json_file[:5]:
    print(record["author"])

In [12]:
# Preview the first rows of the frame built from the raw JSON records.
some_data.head()

In [13]:
def get_author_details(text):
    """Pull three fields out of an author record's "properties" mapping.

    Parameters
    ----------
    text : mapping
        Author record that has a "properties" entry containing the keys
        "friends", "verified" and "status_count".

    Returns
    -------
    tuple
        ``(friends, status_count, verified)`` - note the order.

    Raises
    ------
    KeyError
        If "properties" or one of the three keys is missing.
    """
    props = text["properties"]
    # Keys are read in this order so that a missing key fails on the same
    # name as before; the returned tuple uses a different order.
    friend_total, is_verified, n_statuses = (
        props[key] for key in ("friends", "verified", "status_count")
    )
    return friend_total, n_statuses, is_verified

In [14]:
# Store each author's (friends, status_count, verified) tuple in a new column.
some_data["result"] = some_data["author"].apply(get_author_details)

In [15]:
# Confirm that the "result" column was added.
some_data.head()

In [16]:
# Flatten the nested records into one table; nested dictionary keys become
# dotted column names such as "author.properties.friends".
jsondata = pd.json_normalize(json_file)

In [17]:
# Preview the flattened table.
jsondata.head()

In [18]:
# Column dtypes and non-null counts of the flattened table.
jsondata.info()

In [19]:
# Preview only: with display.max_rows set to None a bare frame renders every row.
txt_file.head()

In [20]:
# Preview only: with display.max_rows set to None a bare frame renders every row.
csv_file.head()

In [21]:
# Stack the text-file rows on top of the flattened JSON rows. Columns are
# aligned by name and the row index is renumbered from 0.
json_txt_data = pd.concat([txt_file, jsondata], ignore_index=True)

In [22]:
json_txt_data.head()

In [23]:
# Shapes of the two inputs and of the combined frame, to sanity-check that the
# combined row count is the sum of the inputs.
jsondata.shape

In [24]:
txt_file.shape

In [25]:
json_txt_data.shape

In [26]:
# Column dtypes and non-null counts after the first concatenation.
json_txt_data.info()

In [27]:
# Append the CSV rows to the combined text + JSON frame (index renumbered).
json_txt_csv_data = pd.concat([json_txt_data, csv_file], ignore_index=True)

In [28]:
json_txt_csv_data.head()

In [29]:
json_txt_csv_data.shape

In [30]:
# NOTE: this binds a second name to the same DataFrame object - no copy is
# made, so in-place changes through either name are visible through both.
author_data = json_txt_csv_data

In [31]:
author_data.head()

In [32]:
# List the column names before they are renamed.
author_data.columns

In [33]:
# Give the dotted, flattened column names short analysis-friendly names.
# rename() is used without inplace=True so the cell builds a new frame: it can
# be re-run safely and does not silently alter the frame that `author_data`
# was assigned from.
author_data = author_data.rename(
    columns={
        'author.properties.friends': 'friends',
        'author.properties.verified': 'verified',
        'author.properties.status_count': 'status_count',
        'properties.sentiment': 'sentiment',
        'properties.platform': 'platform',
        'location.longitude': 'longitude',
        'location.latitude': 'latitude',
        'location.country': 'country',
        'content.body': 'tweet',
    }
)


In [34]:
# Distinct values found in the renamed "sentiment" column.
author_data["sentiment"].unique()

In [35]:
# Column names after the rename.
author_data.columns

In [36]:
author_data.head()

# Exploratory Data Analysis

In [37]:
# Categorical plot of status_count for each distinct friends value
# (no `kind` is given, so seaborn uses its default catplot kind).
sns.catplot(x='friends', y='status_count', data=author_data, height=8, aspect=12/8)

In [38]:
# Correlate numeric columns only: the frame also holds text columns, and
# DataFrame.corr() raises on those in pandas >= 2.0 (numeric_only=False default).
sns.heatmap(author_data.select_dtypes(include="number").corr(), cbar=True, linewidths=0.3)

In [39]:
# One bar per sentiment label; bar height is seaborn's aggregate of longitude
# for that label (the mean unless an estimator is passed).
sns.barplot(data=author_data, y="longitude", x = "sentiment")
plt.show()

In [40]:
# Number of missing values in each column.
author_data.isnull().sum()

In [41]:
# Column dtypes and non-null counts of the combined frame.
author_data.info()

In [42]:
# author_data["friends"].value_counts().to_frame()

In [43]:
# Show the rows whose "friends" value is this free-text string.
# NOTE(review): "friends" is cast to int further down, so this looks like a
# check for a malformed input row - confirm.
author_data[author_data["friends"] == '|| TELL ME YOUR NAME! XD']

In [44]:
# Keep only the rows that have no missing value in any column.
author_data_new = author_data.dropna()

In [45]:
# Column dtypes and non-null counts after dropping incomplete rows.
author_data_new.info()

In [46]:
# NOTE(review): this cell repeats the previous cell's call verbatim.
author_data_new.info()

In [47]:
# Verify that no missing values remain.
author_data_new.isnull().sum()

In [48]:
author_data_new.head()

In [49]:
# Cast through assign() so a new frame is built instead of writing a column
# into the result of dropna(), which can raise SettingWithCopyWarning.
author_data_new = author_data_new.assign(friends=author_data_new["friends"].astype(int))

In [50]:
# Bar chart of the friends count per "verified" value, with one bar colour per
# sentiment label (bar height is seaborn's aggregate, the mean by default).
sns.catplot(data=author_data_new, kind="bar", x="verified", y="friends", hue="sentiment")

In [51]:
# Load GeoPandas' low-resolution world map.
# NOTE(review): `world` is not used by the plot below, and
# `gpd.datasets.get_path` was deprecated in GeoPandas 0.14 and removed in 1.0 -
# confirm whether this load is still needed.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Scatter status count against longitude; marker size and colour both encode
# the status count.
x = author_data['longitude']
y = author_data['status_count']
# `z` was used by plt.scatter() while its assignment was commented out, so the
# cell failed with NameError on a fresh kernel.
z = author_data['status_count']
plt.scatter(x, y, s=20*z, c=z, alpha=0.6, vmin=0, cmap='autumn')

plt.colorbar(label='Peoples Status Count Based on location')

plt.title("Author: Status Count Based on location")
plt.xlabel("Longitude")
plt.ylabel("Status count")
plt.show()

In [None]:
# Make sure the output folder exists; savefig() does not create directories.
os.makedirs("images", exist_ok=True)

# Plot every row's position (longitude on x, latitude on y) and save it.
plt.scatter(x=author_data['longitude'], y=author_data['latitude'])
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.savefig("images/location.png")
plt.show()

In [None]:
# Make sure the output folder exists; savefig() does not create directories.
os.makedirs("images", exist_ok=True)

# Bar chart of status_count per friends value, saved from the current figure.
sns.catplot(data=author_data_new, kind="bar", x="friends", y="status_count")
plt.savefig("images/status of friends.png")

# Modelling

In [None]:
# Columns, shape and first rows of the cleaned frame used for modelling.
author_data_new.columns

In [None]:
author_data_new.shape

In [None]:
author_data_new.head()

In [None]:
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Hold out 33% of the cleaned rows for testing; the fixed random_state makes
# the split reproducible.
train_data, test_data = train_test_split(author_data_new, test_size=0.33, random_state=42)


In [None]:
# Shapes of the two partitions.
train_data.shape, test_data.shape

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
%%time

# Regression model for the "friends" column; training is limited to
# `time_limit` seconds and the artefacts are written under `save_path`.
TARGET = "friends"
time_limit = 60
save_path = 'artefacts/models_regression'

predictor = TabularPredictor(
    label=TARGET,
    path=save_path,
    problem_type="regression",
).fit(train_data, time_limit=time_limit)

In [None]:
%%time

# Regression model for the "status_count" column; training is limited to
# `time_limit` seconds and the artefacts are written under `save_path`.
# Note that this rebinds `predictor`, replacing any earlier one.
TARGET = "status_count"
time_limit = 60
save_path = 'artefacts/models_regression_statuscount'

predictor = TabularPredictor(
    label=TARGET,
    path=save_path,
    problem_type="regression",
).fit(train_data, time_limit=time_limit)

In [None]:
from autogluon.multimodal import MultiModalPredictor


In [None]:
# Summarise the fit of whichever model `predictor` currently refers to - when
# the cells run top to bottom that is the status_count model, because each
# training cell rebinds `predictor`.
predictor.fit_summary()