# Pretraitement
Les fichiers qui utilisent l'extension `.json` sont en fait une liste de dictionnaires Python. Il est donc impossible de les importer avec la librairie `json`, puisque les dictionnaires Python utilisent des `single quotes` et non des `double quotes` pour les clefs.  

Les fichiers doivent être convertis en csv.  

In [None]:
# Each line of the file holds one serialized record: a Python dict literal
# (single-quoted keys), not strict JSON. Example of one such record:

json_seq = {
    "user_id": "76561197970982479",
    "user_url": "http://steamcommunity.com/profiles/76561197970982479",
    "reviews": [
        {
            "funny": "",
            "posted": "Posted November 5, 2011.",
            "last_edited": "",
            "item_id": "1250",
            "helpful": "No ratings yet",
            "recommend": True,
            "review": (
                'Simple yet with great replayability. In my opinion does '
                '"zombie" hordes and team work better than left 4 dead plus '
                'has a global leveling system. Alot of down to earth "zombie" '
                'splattering fun for the whole family. Amazed this sort of '
                'FPS is so rare.'
            ),
        },
        {
            "funny": "",
            "posted": "Posted July 15, 2011.",
            "last_edited": "",
            "item_id": "22200",
            "helpful": "No ratings yet",
            "recommend": True,
            "review": "It's unique and worth a playthrough.",
        },
        {
            "funny": "",
            "posted": "Posted April 21, 2011.",
            "last_edited": "",
            "item_id": "43110",
            "helpful": "No ratings yet",
            "recommend": True,
            "review": (
                "Great atmosphere. The gunplay can be a bit chunky at times "
                "but at the end of the day this game is definitely worth it "
                "and I hope they do a sequel...so buy the game so I get a "
                "sequel!"
            ),
        },
    ],
}

In [1]:
# Fonction pour importer les dictionnaires Python en liste

import ast

def import_python_dicts(filename, encoding='utf-8'):
    """Load a text file that stores one Python literal (a dict) per line.

    Each non-blank line is parsed with ``ast.literal_eval``, which only
    evaluates literals and never executes code. Lines that cannot be parsed
    are reported on stdout and skipped, so one bad record does not abort
    the whole import.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            rather than the platform default, because the review text
            contains non-ASCII characters.

    Returns:
        A list with the parsed value of every valid line, in file order.
    """
    dicts = []
    with open(filename, 'r', encoding=encoding) as file:
        for line in file:
            text = line.strip()
            if not text:
                # Blank lines carry no record; do not report them as errors.
                continue
            try:
                dictionary = ast.literal_eval(text)
            except (SyntaxError, ValueError, TypeError) as e:
                # literal_eval can also raise TypeError on malformed input
                # (e.g. an unhashable dict key).
                print(f"Error parsing line: {text} - {e}")
            else:
                dicts.append(dictionary)
    return dicts

In [2]:
# Load the raw file: one dict per user, each carrying a 'reviews' list.
reviews = 'data/australian_user_reviews.json'
reviews_dicts = import_python_dicts(reviews)
print(reviews_dicts[0]['reviews'][1])

# Tag every review with the id of the user it belongs to.
for user_entry in reviews_dicts:
    for review_entry in user_entry['reviews']:
        review_entry['user_id'] = user_entry['user_id']
print(reviews_dicts[0]['reviews'][1])


# Flatten the per-user lists into a single list with one review per item.
all_reviews = [
    review_entry
    for user_entry in reviews_dicts
    for review_entry in user_entry['reviews']
]


print(all_reviews[1])

{'funny': '', 'posted': 'Posted July 15, 2011.', 'last_edited': '', 'item_id': '22200', 'helpful': 'No ratings yet', 'recommend': True, 'review': "It's unique and worth a playthrough."}
{'funny': '', 'posted': 'Posted July 15, 2011.', 'last_edited': '', 'item_id': '22200', 'helpful': 'No ratings yet', 'recommend': True, 'review': "It's unique and worth a playthrough.", 'user_id': '76561197970982479'}
{'funny': '', 'posted': 'Posted July 15, 2011.', 'last_edited': '', 'item_id': '22200', 'helpful': 'No ratings yet', 'recommend': True, 'review': "It's unique and worth a playthrough.", 'user_id': '76561197970982479'}


In [3]:
from datetime import datetime

# Show two raw samples and the total count before cleaning.
print(all_reviews[7])
print(all_reviews[48950])
print(len(all_reviews))

# Fixed text wrapped around the raw dates, e.g. 'Posted July 15, 2011.' and
# 'Last edited December 5, 2015.' (prefix + date + trailing period).
_POSTED_PREFIX = 'Posted '
_EDITED_PREFIX = 'Last edited '
# NOTE(review): dates without a year (e.g. 'Posted March 3.') are assumed to
# belong to 2016 -- presumably the year the data was collected; confirm.
_DEFAULT_YEAR = '2016'


def _leading_count(text, empty_marker):
    """Return the leading integer of *text*, or 0 if *text* is *empty_marker*.

    Handles thousands separators, e.g. '1,234 people found ...' -> 1234.
    """
    if text == empty_marker:
        return 0
    return int(text.split(' ')[0].replace(',', ''))


def _to_iso_date(date_text):
    """Convert 'Month D[, YYYY]' to 'YYYY-MM-DD', filling in a missing year.

    The year is detected as a 4-digit token after the last ', ' instead of
    searching for the substring '201', which only recognised 2010-2019 and
    would append a second year to any other date.

    Raises:
        ValueError: if the text does not match the '%B %d, %Y' format.
    """
    tail = date_text.rpartition(', ')[2]
    if not (len(tail) == 4 and tail.isdigit()):
        date_text += ', ' + _DEFAULT_YEAR
    return datetime.strptime(date_text, '%B %d, %Y').strftime('%Y-%m-%d')


for review in all_reviews:
    review['funny'] = _leading_count(review['funny'], '')
    review['helpful'] = _leading_count(review['helpful'], 'No ratings yet')

    # Drop the fixed prefix and the trailing period before parsing the date.
    review['posted'] = _to_iso_date(review['posted'][len(_POSTED_PREFIX):-1])
    edited = review['last_edited']
    if edited:
        review['last_edited'] = _to_iso_date(edited[len(_EDITED_PREFIX):-1])
    else:
        # Never edited: the last edit date is the posting date.
        review['last_edited'] = review['posted']

    # Collapse every run of whitespace (including newlines) to one space.
    review['review'] = ' '.join(review['review'].split())


# Show the same two samples after cleaning.
print(all_reviews[7])
print(all_reviews[48950])
print(len(all_reviews))

{'funny': '', 'posted': 'Posted December 4, 2015.', 'last_edited': 'Last edited December 5, 2015.', 'item_id': '370360', 'helpful': 'No ratings yet', 'recommend': True, 'review': '"Run for fun? What the hell kind of fun is that?"', 'user_id': 'evcentric'}
{'funny': '', 'posted': 'Posted March 3.', 'last_edited': '', 'item_id': '299460', 'helpful': '3 of 7 people (43%) found this review helpful', 'recommend': True, 'review': 'Купила за 79 рублей. Всем рекомендую.', 'user_id': 'BecHyIIIka'}
59305
{'funny': 0, 'posted': '2015-12-04', 'last_edited': '2015-12-05', 'item_id': '370360', 'helpful': 0, 'recommend': True, 'review': '"Run for fun? What the hell kind of fun is that?"', 'user_id': 'evcentric'}
{'funny': 0, 'posted': '2016-03-03', 'last_edited': '2016-03-03', 'item_id': '299460', 'helpful': 3, 'recommend': True, 'review': 'Купила за 79 рублей. Всем рекомендую.', 'user_id': 'BecHyIIIka'}
59305


In [4]:
import pandas as pd

df = pd.DataFrame(all_reviews)
# Put 'user_id' first. Select it by name rather than rotating the last
# column to the front, so the result does not depend on 'user_id' being
# the last column of the frame.
cols = ['user_id'] + [col for col in df.columns if col != 'user_id']
df = df[cols]
print(df.head(2))



             user_id  funny      posted last_edited item_id  helpful  \
0  76561197970982479      0  2011-11-05  2011-11-05    1250        0   
1  76561197970982479      0  2011-07-15  2011-07-15   22200        0   

   recommend                                             review  
0       True  Simple yet with great replayability. In my opi...  
1       True               It's unique and worth a playthrough.  


In [None]:
# Strip special characters (anything outside printable ASCII).
# Left disabled: it would also wipe the Russian reviews.

# df['review'] = df["review"].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x]))
# print(df.head(2))

   user_id  funny      posted last_edited item_id  helpful  recommend  \
0        0      0  2011-11-05  2011-11-05    1250        0       True   
1        0      0  2011-07-15  2011-07-15   22200        0       True   

                                              review  
0  Simple yet with great replayability. In my opi...  
1               It's unique and worth a playthrough.  


In [6]:
import os

# Write the cleaned reviews to CSV without the DataFrame index column.
output_csv = 'data_csv/aus_reviews.csv'
# to_csv does not create missing parent directories, so create the target
# folder first (no-op when it already exists).
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df.to_csv(output_csv, index=False)